diff --git a/.flake/pkgs/fccf/default.nix b/.flake/pkgs/fccf/default.nix new file mode 100644 index 0000000000..f792b8606c --- /dev/null +++ b/.flake/pkgs/fccf/default.nix @@ -0,0 +1,54 @@ +{ fetchFromGitHub +, stdenv +, cmake +, pkg-config +, libclang +, libllvm +, lib +, zlib +, argparse +, nlohmann_json +, fmt +}: + +stdenv.mkDerivation rec { + pname = "fccf"; + version = "03d373fc65e2d7ceeac441ba4bbddfdc25618dff"; + + src = fetchFromGitHub { + owner = "p-ranav"; + repo = "fccf"; + rev = version; + sha256 = "sha256-3NdPon5ZfjoGFFgBlb0rzRnfWgSopvAc5Gls2NWHaOE="; + }; + + nativeBuildInputs = [ + cmake + pkg-config + ]; + + buildInputs = [ + libclang + libllvm + zlib + argparse + nlohmann_json + fmt + ]; + + patches = [ + ./json-package-name.patch + ./fix-argparse-include.patch + ]; + + cmakeFlags = [ + "-DCMAKE_BUILD_TYPE=Release" + "-DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=ALWAYS" + ]; + + meta = with lib; { + description = "A command-line tool that quickly searches through C/C++ source code in a directory based on a search string and prints relevant code snippets that match the query"; + homepage = "https://github.com/p-ranav/fccf"; + license = licenses.mit; + }; +} diff --git a/.flake/pkgs/fccf/fix-argparse-include.patch b/.flake/pkgs/fccf/fix-argparse-include.patch new file mode 100644 index 0000000000..2cb648c1bf --- /dev/null +++ b/.flake/pkgs/fccf/fix-argparse-include.patch @@ -0,0 +1,13 @@ +diff --git a/source/main.cpp b/source/main.cpp +index 7e131d3..6c05d89 100644 +--- a/source/main.cpp ++++ b/source/main.cpp +@@ -6,7 +6,7 @@ + #include + #include + +-#include ++#include + #include + #include "searcher.hpp" + #include diff --git a/.flake/pkgs/fccf/json-package-name.patch b/.flake/pkgs/fccf/json-package-name.patch new file mode 100644 index 0000000000..51f6a012cf --- /dev/null +++ b/.flake/pkgs/fccf/json-package-name.patch @@ -0,0 +1,12 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 20bcbbf..923075f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt 
+@@ -48,6 +48,7 @@ FetchContent_MakeAvailable(fmt) + + FetchContent_Declare(json + URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz ++ FIND_PACKAGE_ARGS NAMES nlohmann_json + ) + FetchContent_MakeAvailable(json) + diff --git a/.github/runs-on.yml b/.github/runs-on.yml index a4fff33536..5033e69d65 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,23 +1,4 @@ images: - runs-on-gpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - # to find, go to - # https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Images:visibility=public-images;search=:runs-on;v=3;$case=tags:false%5C,client:false;$regex=tags:false%5C,client:false - name: "runs-on-v2.2-ubuntu22-gpu-x64-20250220122045" - - runs-on-cpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - name: "runs-on-v2.2-ubuntu22-full-x64-20250220122045" - - official-ubuntu-ami: - platform: "linux" - arch: "x64" - ami: "ami-0a60b027285c0d4c5" - flexflow-gpu-ci: platform: "linux" arch: "x64" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9d98fb07dd..799e3069a9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,9 +57,9 @@ jobs: name: GPU unit tests needs: cpu-ci runs-on: - - runs-on + - runs-on=${{ github.run_id }} - family=g4dn.xlarge - - image=runs-on-gpu-pinned + - image=flexflow-gpu-ci strategy: max-parallel: 1 diff --git a/.proj.toml b/.proj.toml index a06fb53c3a..8eed6166cd 100644 --- a/.proj.toml +++ b/.proj.toml @@ -2,57 +2,81 @@ project_name = "flexflow" testsuite_macro = "FF_TEST_SUITE" namespace_name = "FlexFlow" header_extension = ".h" +cuda_launch_cmd = [ + "nixGL", + "--", +] [targets.utils] type = "lib" -tests = true -benchmarks = true +has-cpu-only-tests = true +has-cpu-only-benchmarks = true +has-cuda-tests = false +has-cuda-benchmarks = false [targets.op-attrs] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true 
+has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.kernels] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false [targets.pcg] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.substitutions] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.compiler] type = "lib" -tests = true -benchmarks = true +has-cpu-only-tests = true +has-cpu-only-benchmarks = true +has-cuda-tests = false +has-cuda-benchmarks = false [targets.substitution-generator] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.local-execution] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.models] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.export-model-arch] type = "bin" +cuda = false [targets.substitution-to-dot] type = "bin" +cuda = false # default_build_targets = [ # "utils", diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index 478ebda318..ef5d6d9d11 100644 --- a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -126,11 +126,16 @@ function(ff_add_test_executable) ${FF_TEST_EXEC_NAME} ${SRC}) + target_include_directories( + ${FF_TEST_EXEC_NAME} + PRIVATE + ${FF_TEST_EXEC_PRIVATE_INCLUDE}) + target_link_libraries( ${FF_TEST_EXEC_NAME} ${FF_TEST_EXEC_DEPS}) - target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE 
FF_TEST_SUITE="${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") + target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="cpu-${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") define_ff_vars(${FF_TEST_EXEC_NAME}) ff_set_cxx_properties(${FF_TEST_EXEC_NAME}) diff --git a/flake.lock b/flake.lock index c991232013..ff6e797d51 100644 --- a/flake.lock +++ b/flake.lock @@ -66,11 +66,11 @@ ] }, "locked": { - "lastModified": 1741679698, - "narHash": "sha256-poSOQS/2qImAo/PgRu37pHdOrwAsZEyC8PMM3evFLX4=", + "lastModified": 1746157536, + "narHash": "sha256-g4Hx/05+Ce3hl8OS1zm4pY/+ThD1blWKmcaPsohSX5Y=", "owner": "lockshaw", "repo": "proj", - "rev": "0de983ff66abea4703f73988d29fc807e2b0a9bd", + "rev": "5871bc7b7fb9d7d7f14c8bca6c50a0cf2e75834d", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 77a6c61b7d..5fa48fa3fd 100644 --- a/flake.nix +++ b/flake.nix @@ -59,6 +59,7 @@ bencher-cli = pkgs.callPackage ./.flake/pkgs/bencher-cli.nix { }; ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; }; hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { }; + fccf = pkgs.callPackage ./.flake/pkgs/fccf { }; rapidcheckFull = pkgs.symlinkJoin { name = "rapidcheckFull"; paths = (with pkgs; [ rapidcheck.out rapidcheck.dev ]); @@ -162,6 +163,7 @@ ruff jq gh + expect ]) (with pkgs.python3Packages; [ gitpython @@ -179,6 +181,7 @@ (with self.packages.${system}; [ ffdb hpp2plantuml + fccf ]) ]; }; diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( @@ -30,6 +29,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h 
b/lib/kernels/include/kernels/accessor.h index 39da65c3be..f9bef91b25 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -1,25 +1,88 @@ #ifndef _FLEXFLOW_KERNELS_ACCESSOR_H #define _FLEXFLOW_KERNELS_ACCESSOR_H -#include "array_shape.h" -#include "device.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" -#include "utils/exception.h" +#include "pcg/device_type.dtg.h" +#include "utils/containers/transform.h" #include "utils/required.h" +#include namespace FlexFlow { +nonnegative_int + calculate_accessor_offset(LegionOrdered const &, + ArrayShape const &); + +class GenericTensorAccessorR { +public: + template + typename data_type_enum_to_class
::type const *get() const { + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + return static_cast const *>(this->ptr); + } + + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; + + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template + real_type_t
<DT> const &at(FFOrdered<nonnegative_int> const &indices) const { + return this->at<DT>(legion_ordered_from_ff_ordered(indices)); + } + + template <DataType DT> + real_type_t<DT>
const & + at(LegionOrdered const &indices) const { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorR::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset.unwrap_nonnegative()]; + } + +public: + DataType data_type; + ArrayShape shape; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; +}; + +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); + class GenericTensorAccessorW { public: template typename data_type_enum_to_class
::type *get() const { - if (this->data_type == DT) { - return static_cast *>(this->ptr); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + return static_cast *>(this->ptr); } int32_t *get_int32_ptr() const; @@ -28,76 +91,76 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; -public: - DataType data_type; - ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); + GenericTensorAccessorW() = delete; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); + GenericTensorAccessorW(DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; -class GenericTensorAccessorR { -public: template - typename data_type_enum_to_class
::type const *get() const { - if (this->data_type == DT) { - return static_cast const *>(this->ptr); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } + real_type_t
&at(FFOrdered const &indices) { + return this->at
(legion_ordered_from_ff_ordered(indices)); } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + template + real_type_t
&at(LegionOrdered const &indices) { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorW::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset.unwrap_nonnegative()]; + } + + template + real_type_t
<DT> const &at(FFOrdered<nonnegative_int> const &indices) const { + return this->at<DT>(legion_ordered_from_ff_ordered(indices)); + } + + template <DataType DT> + real_type_t<DT>
&at(LegionOrdered const &indices) const { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorW::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; + void *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); - -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); template typename data_type_enum_to_class
::type * get(GenericTensorAccessorW const &a) { - if (a.data_type == DT) { - return static_cast *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast *>(a.ptr); } template @@ -113,12 +176,8 @@ std::vector *> template typename data_type_enum_to_class
::type const * get(GenericTensorAccessorR const &a) { - if (a.data_type == DT) { - return static_cast const *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast const *>(a.ptr); } int32_t const *get_int32_ptr(GenericTensorAccessorR const &); @@ -137,6 +196,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -150,12 +224,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -163,8 +233,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); + +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW 
&dst_accessor, + GenericTensorAccessorR const &src_accessor); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..39bad6599c 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include @@ -11,6 +11,8 @@ struct IAllocator { virtual void *allocate(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -18,9 +20,14 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + void deallocate_tensor(GenericTensorAccessorW const &); + void deallocate_tensor(GenericTensorAccessorR const &); + void *allocate(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type diff --git a/lib/kernels/include/kernels/array_coord.struct.toml b/lib/kernels/include/kernels/array_coord.struct.toml new file mode 100644 index 0000000000..8ce121f2bf --- /dev/null +++ b/lib/kernels/include/kernels/array_coord.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "ArrayCoord" +features = [ + "eq", + "ord", + "hash", + "fmt", + "rapidcheck", + "json", +] + +includes = [ + "op-attrs/ff_ordered/ff_ordered.h", + "utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "ff_ordered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 57498ee466..25ef8116f2 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #define 
_FLEXFLOW_KERNELS_ARRAY_SHAPE_H +#include "kernels/array_coord.dtg.h" #include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -15,9 +16,7 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); - ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); + explicit ArrayShape(LegionOrdered const &dims); /** * @brief Alias of ArrayShape::num_elements for compatibility with @@ -46,24 +45,40 @@ struct ArrayShape { std::optional at_maybe(legion_dim_t) const; std::optional at_maybe(ff_dim_t) const; - ArrayShape - sub_shape(std::optional> start, - std::optional> end) const; + ArrayShape sub_shape(ff_dim_t const &start, + std::optional const &end) const; + + ArrayShape sub_shape(legion_dim_t const &start, + std::optional const &end) const; public: LegionOrdered dims; private: std::tuple tie() const; + + friend ::std::hash; }; +std::string format_as(ArrayShape const &); +std::ostream &operator<<(std::ostream &, ArrayShape const &); + nonnegative_int get_volume(ArrayShape const &); +ArrayShape array_shape_from_tensor_shape(TensorShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); -std::string format_as(ArrayShape const &); -std::ostream &operator<<(std::ostream &, ArrayShape const &); +std::unordered_set get_array_coord_set(ArrayShape const &); } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::ArrayShape> { + size_t operator()(::FlexFlow::ArrayShape const &) const; +}; + +} // namespace std + #endif diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..b3c77d3430 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/device.h" #include "kernels/ff_handle.h" @@ -64,8 +63,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +103,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..8b67f564d2 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index f2ca17f429..9bb2753a12 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -1,15 +1,13 @@ #ifndef _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H 
#define _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include -namespace FlexFlow { -namespace Kernels { -namespace BatchNorm { +namespace FlexFlow::Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -29,9 +27,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &per_device_state, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, @@ -46,8 +44,5 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels -} // namespace FlexFlow - +} // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..5ec4cb3975 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -1,29 +1,19 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H -#include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorR const &output, + 
GenericTensorAccessorW const &input); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..343ba253d9 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Cast { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..c87465a01f 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..75fdd56498 --- /dev/null +++ 
b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Combine { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Combine + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..1e3c55bf59 100644 --- a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..3b7c0672df 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "utils/visitable.h" @@ -34,8 +34,7 @@ 
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -61,17 +60,16 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..81fd59dafb --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorR + copy_tensor_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &, + Allocator &cpu_allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &, + Allocator &cpu_allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e83fc3325d..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ 
b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { @@ -33,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -41,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -54,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -68,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..2cc6dd60a3 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git 
a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..fd596f2ccf 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H -#include "device.h" #include "ff_handle.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..0257b3b4a6 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/ops/element_unary.h" #include @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -37,13 +36,12 @@ void backward_kernel(ffStream_t stream, 
ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); -} // namespace ElementUnary -} // namespace Kernels +} // namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..f51a730314 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -19,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff 
--git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 179ce41cbf..31b3296a98 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -5,7 +5,7 @@ #include #endif -#include "device.h" +#include "kernels/device.h" #include "utils/visitable.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..b2b1164f92 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -1,23 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); + void backward_kernel(ffStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); -} // namespace Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/format_accessor_contents.h b/lib/kernels/include/kernels/format_accessor_contents.h new file mode 100644 index 0000000000..b50cffbbef --- /dev/null +++ b/lib/kernels/include/kernels/format_accessor_contents.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H + +#include "kernels/accessor.h" + +namespace FlexFlow { + +std::string format_accessor_r_contents(GenericTensorAccessorR const &); +std::string format_accessor_w_contents(GenericTensorAccessorW const &); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..8cbc7e457e 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,23 +15,21 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..10cf2fb14b 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" namespace FlexFlow { @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. 
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 7b9b9c455c..947bbd00bb 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -2,7 +2,13 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LEGION_DIM_H #include "kernels/legion_dim_t.dtg.h" -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "kernels/legion_ordered/legion_ordered.h" +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/set_of.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -11,7 +17,10 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions); template -using LegionOrdered = DimOrdered; +std::set key_range(LegionOrdered const &d) { + return transform(set_of(nonnegative_range(num_elements(d))), + [](nonnegative_int i) { return legion_dim_t{i}; }); +} template FFOrdered @@ -25,17 +34,6 @@ LegionOrdered return LegionOrdered(ff_ordered.rbegin(), ff_ordered.rend()); } -template -std::string format_as(LegionOrdered const &v) { - std::vector as_vec(v.cbegin(), v.cend()); - return fmt::format("", as_vec); -} - -template -std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { - return (s << fmt::to_string(v)); -} - } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h 
b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h new file mode 100644 index 0000000000..ad8b3bad6d --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h @@ -0,0 +1,197 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H + +#include "kernels/legion_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct LegionOrdered { + LegionOrdered() {} + + LegionOrdered(std::initializer_list const &l) + : contents(l.begin(), l.end()) {} + + LegionOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + LegionOrdered(It begin, It end) : contents(begin, end) {} + + template + LegionOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(legion_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(legion_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &operator[](legion_dim_t idx) const { + return this->at(idx); + } + + T &operator[](legion_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(legion_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool operator==(LegionOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(LegionOrdered const &other) const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using 
const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(LegionOrdered const &lhs, LegionOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(LegionOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::LegionOrdered> { + static ::FlexFlow::LegionOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, + ::FlexFlow::LegionOrdered const &x) { + j = std::vector{x.cbegin(), 
x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::LegionOrdered> { + size_t operator()(::FlexFlow::LegionOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::LegionOrdered> { + static Gen<::FlexFlow::LegionOrdered> arbitrary() { + return gen::construct<::FlexFlow::LegionOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/slice.h b/lib/kernels/include/kernels/legion_ordered/slice.h new file mode 100644 index 0000000000..6980c0d9ec --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/slice.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H + +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template +LegionOrdered slice(LegionOrdered const &d, + legion_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](legion_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + + return LegionOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/transform.h b/lib/kernels/include/kernels/legion_ordered/transform.h new file mode 100644 index 0000000000..55cc1ff1ea --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H + 
+#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +LegionOrdered transform(LegionOrdered const &d, F &&f) { + return LegionOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 3128e39fd0..21d84c2567 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/ops/linear_attrs.dtg.h" @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -51,29 +50,28 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace 
Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h similarity index 74% rename from lib/local-execution/include/local-execution/local_cpu_allocator.h rename to lib/kernels/include/kernels/local_cpu_allocator.h index d1e81facf2..9653dcf00e 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -1,3 +1,6 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H + #include "kernels/allocation.h" #include @@ -12,6 +15,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; @@ -20,3 +25,5 @@ CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); Allocator create_local_cpu_memory_allocator(); } // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -12,6 +12,8 @@ struct LocalCudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..576edb0ffa 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H #define _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H -#include "device.h" +#include "kernels/device.h" namespace 
FlexFlow { @@ -19,6 +19,9 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; +private: + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..9bd9370685 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -21,6 +24,9 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; +private: + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..430608db55 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + 
PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..d552831c78 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -16,15 +17,18 @@ void sgd_ps_update_task_gpu(ffStream_t, float *weight_ptr, float *sgd_v_ptr); +#ifdef FF_USE_NCCL void sgd_nccl_update_task_gpu(ffStream_t, float lr, float momentum, bool nesterov, - float weight_decay PerDeviceFFHandle const &, + float weight_decay, + PerDeviceFFHandle const &, float const *weight_grad_ptr, size_t size, float *weight_ptr, float *sgd_v_ptr); +#endif void adam_ps_update_task_gpu(ffStream_t, float alpha_t, @@ -33,9 +37,11 @@ void adam_ps_update_task_gpu(ffStream_t, float weight_decay, float epsilon, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + int num_replicas, + float *weight_ptr, float *adam_v_ptr, 
- float *weight_ptr); + float *adam_m_ptr); void adam_nccl_update_task_gpu(ffStream_t, float alpha_t, @@ -45,9 +51,10 @@ void adam_nccl_update_task_gpu(ffStream_t, float epsilon, PerDeviceFFHandle const &, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + float *weight_ptr, float *adam_v_ptr, - float *weight_ptr); + float *adam_m_ptr); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..aa3a7a1ef7 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -26,11 +25,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git 
a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..76aa07d0a4 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "op-attrs/ops/pool_2d.h" @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -70,13 +69,12 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 655d540685..7c4145c426 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_PROFILING_H #define _FLEXFLOW_KERNELS_PROFILING_H -#include "device.h" +#include "kernels/device.h" #include "kernels/profiling_settings.dtg.h" #include "utils/visitable.h" diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..10e8e4393b 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H -#include "array_shape.h" -#include "device.h" -#include "ff_handle.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" #include "op-attrs/operator_type.dtg.h" namespace FlexFlow { @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..08f73cd9ab 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -14,11 +12,9 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h 
index 409fc81f44..0b113868ee 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -1,24 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..2a2eaa5eb6 --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Replicate { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas); + +} // namespace FlexFlow::Kernels::Replicate + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..88c11d2fb0 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,8 +1,8 @@ #ifndef 
_FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "utils/required_core.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -25,11 +24,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..768707175c 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -1,30 +1,21 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/reverse_kernels_cpu.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); void backward_kernel(ffStream_t stream, - float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); + GenericTensorAccessorR const 
&output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..ec82000f8f --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow::Kernels::Reverse { + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); + +} // namespace FlexFlow::Kernels::Reverse + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reverse_kernels_params.h b/lib/kernels/include/kernels/reverse_kernels_params.h new file mode 100644 index 0000000000..766d70b915 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H + +#include "kernels/array_shape.h" +#include "kernels/reverse_kernels_params.dtg.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml 
new file mode 100644 index 0000000000..a5dbd750bc --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml @@ -0,0 +1,28 @@ +namespace = "FlexFlow" +name = "ReverseKernelsParams" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + +[[fields]] +name = "num_out_blks" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "reverse_dim_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "in_blk_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "out_size" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..60101578e3 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" namespace FlexFlow { @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -31,12 +30,11 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..3b580f94be 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -1,12 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H -#include "device.h" +#include "kernels/device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..085594d57f 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" namespace FlexFlow { @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 0f1cc2ae61..776370dcbd 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -1,15 +1,14 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/transpose_attrs.dtg.h" #include namespace FlexFlow { -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { 
void forward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, @@ -18,11 +17,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc deleted file mode 100644 index 27b7eb390d..0000000000 --- a/lib/kernels/src/accessor.cc +++ /dev/null @@ -1,192 +0,0 @@ -#include "kernels/accessor.h" - -namespace FlexFlow { - -int32_t *GenericTensorAccessorW::get_int32_ptr() const { - return this->get(); -} - -int64_t *GenericTensorAccessorW::get_int64_ptr() const { - return this->get(); -} - -float *GenericTensorAccessorW::get_float_ptr() const { - return this->get(); -} - -double *GenericTensorAccessorW::get_double_ptr() const { - return this->get(); -} - -half *GenericTensorAccessorW::get_half_ptr() const { - return this->get(); -} - -std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { - return (s << fmt::to_string(a)); -} - -int32_t const *GenericTensorAccessorR::get_int32_ptr() const { - return this->get(); -} - -int64_t const *GenericTensorAccessorR::get_int64_ptr() const { - return this->get(); -} - -float const *GenericTensorAccessorR::get_float_ptr() const { - return this->get(); -} - -double const *GenericTensorAccessorR::get_double_ptr() const { - return this->get(); -} - -half const *GenericTensorAccessorR::get_half_ptr() const { - return get(); -} - -std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("", - a.data_type, - a.shape, - 
a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { - return (s << fmt::to_string(a)); -} - -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -float const *get_float_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -double const *get_double_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -half const *get_half_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -GenericTensorAccessorR read_only_accessor_from_write_accessor( - GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - 
GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc deleted file mode 100644 index d666592e77..0000000000 --- a/lib/kernels/src/allocation.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include "kernels/allocation.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -void *Allocator::allocate(size_t mem_size) { - return this->i_allocator->allocate(mem_size); -} - -void Allocator::deallocate(void *ptr) { - this->i_allocator->deallocate(ptr); -} - -GenericTensorAccessorW - Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); - return {tensor_shape.data_type, tensor_shape, ptr}; -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/cpu/ops/cast_kernels.cc new file mode 100644 index 0000000000..cdd57b8947 --- /dev/null +++ b/lib/kernels/src/cpu/ops/cast_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Cast { + +template +void 
cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume().unwrap_nonnegative(); + cpu_cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); + cpu_cast_backward( + output.get(), input.get(), volume, cast_to(1.0f)); + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch2{}( + output.data_type, input.data_type, output, input); +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc new file mode 100644 index 0000000000..577984f21a --- /dev/null +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -0,0 +1,39 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Combine { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
(), + input.get
(), + input.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + for (int i = 0; i < num_elements; ++i) { + input_grad.get
()[i] += output_grad.get
()[i]; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( + input_grad.data_type, output_grad, input_grad); +} + +} // namespace FlexFlow::Kernels::Combine diff --git a/lib/kernels/src/cpu/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc similarity index 100% rename from lib/kernels/src/cpu/initializer_kernels.cc rename to lib/kernels/src/cpu/ops/initializer_kernels.cc diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..798a4ea8c7 --- /dev/null +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/replicate_kernels_cpu.h" + +namespace FlexFlow::Kernels::Replicate { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + memcpy(output.get
(), + input.get
(), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + nonnegative_int num_elements, + nonnegative_int num_replicas) { + using T = real_type_t
; + + for (nonnegative_int i : nonnegative_range(num_elements)) { + T cur_sum = 0; + for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { + cur_sum += output.at
(LegionOrdered{replica_idx, i}); + } + input.at
(LegionOrdered{i}) = cur_sum; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas) { + nonnegative_int num_elements = input.shape.num_elements(); + DataTypeDispatch1{}(input.data_type, + output, + input, + num_elements, + nonnegative_int{num_replicas}); +} + +} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..4d9eb8cc09 --- /dev/null +++ b/lib/kernels/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,46 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow::Kernels::Reverse { + +template +struct CPUReverseForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + ReverseAttrs const &attrs) { + nonnegative_int reverse_axis_size = input.shape.at(attrs.axis); + + for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { + nonnegative_int input_reverse_axis_coord = + input_coord.ff_ordered.at(attrs.axis); + + ArrayCoord output_coord = input_coord; + output_coord.ff_ordered.at(attrs.axis) = + nonnegative_int{reverse_axis_size.unwrap_nonnegative() - + input_reverse_axis_coord.unwrap_nonnegative() - 1}; + + output.at
(output_coord.ff_ordered) = + input.at
(input_coord.ff_ordered); + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &attrs) { + + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, output_accessor, attrs); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad_accessor, + GenericTensorAccessorW &input_grad_accessor, + ReverseAttrs const &attrs) { + DataTypeDispatch1{}(output_grad_accessor.data_type, + output_grad_accessor, + input_grad_accessor, + attrs); +} + +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 66388c0ec8..86b2d8a437 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "utils/containers/reversed.h" @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template 
__global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..cb84f0e777 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -13,16 +13,15 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - 
TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ 
-192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional 
aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,229 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + 
weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t 
stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -388,39 +557,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + 
weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const 
&weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -448,27 +807,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/loss_function_kernels.cu b/lib/kernels/src/cuda/loss_function_kernels.cu index 6c22efda21..2fccf4b48f 100644 --- a/lib/kernels/src/cuda/loss_function_kernels.cu +++ b/lib/kernels/src/cuda/loss_function_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/loss_function_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..54ecd076f6 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. */ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "internal/device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void 
update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,33 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics 
perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/attention_kernels.cu b/lib/kernels/src/cuda/ops/attention_kernels.cu index 38c32ad9e4..e5bdb6f21d 100644 --- a/lib/kernels/src/cuda/ops/attention_kernels.cu +++ b/lib/kernels/src/cuda/ops/attention_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/attention_kernels.h" #include "kernels/device.h" diff --git a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu index eb23514c5f..348eed9f0c 100644 --- a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/batch_matmul_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 4e153a028e..ceb3a1b3d9 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_kernels.h" #include "kernels/ff_handle.h" @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index fe7aec68b9..f3ea6db660 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/cast_kernels.h" #include "kernels/datatype_dispatch.h" @@ -50,30 +50,26 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorR const &output, + 
GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 7cc67ceed8..08cc343fd2 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/combine_kernels.h" #include "kernels/datatype_dispatch.h" diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 2715ff16e9..37dbbe12f8 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -13,50 +13,58 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + legion_dim_t legion_axis = legion_dim_from_ff_dim(axis, shape.num_dims()); + assert(legion_axis.value < shape.num_dims()); + if (legion_axis.value == 0_n) { + legion_axis.value = 1_n; + } + blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) .num_elements() .unwrap_nonnegative(); - num_blocks = - shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); + num_blocks = shape.sub_shape(legion_axis, std::nullopt) + .num_elements() + .unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, GenericTensorAccessorW const &output, std::vector const &inputs, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = 
inputs.size(); - assert(num_inputs <= MAX_NUM_INPUTS); + assert(inputs.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output.shape, axis); - for (int i = 0; i < num_inputs; i++) { - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - copy_with_stride<<>>(output.get_float_ptr() + offset, - inputs[i].get_float_ptr(), - num_blocks, + input.get_float_ptr(), + blocks_to_copy, output_blk_size, - input_blk_sizes[i]); - offset += input_blk_sizes[i]; + input_blk_size); + + offset += (output_blk_size == input_blk_size) + ? input_blk_size * input_num_blocks + : input_blk_size; } } @@ -64,32 +72,32 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = input_grads.size(); - assert(num_inputs <= MAX_NUM_INPUTS); - + assert(input_grads.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); - for (int i = 0; i < num_inputs; i++) { - ArrayShape shape = input_grads[i].shape; - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - add_with_stride<<>>(input_grads[i].get_float_ptr(), + stream>>>(input_grad.get_float_ptr(), output_grad.get_float_ptr() + offset, - num_blocks, - input_blk_sizes[i], + blocks_to_add, + input_blk_size, output_blk_size); - offset += input_blk_sizes[i]; + + offset += (output_blk_size == input_blk_size) + ? 
input_blk_size * input_num_blocks + : input_blk_size; } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index dac55539d2..16db62a57f 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/conv_2d_kernels.h" namespace FlexFlow { @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/dropout_kernels.cu b/lib/kernels/src/cuda/ops/dropout_kernels.cu index adf0cd8e89..c5fa56bc78 100644 --- a/lib/kernels/src/cuda/ops/dropout_kernels.cu +++ b/lib/kernels/src/cuda/ops/dropout_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/dropout_kernels.h" #include "kernels/ff_handle.h" diff --git a/lib/kernels/src/cuda/ops/element_binary_kernels.cu b/lib/kernels/src/cuda/ops/element_binary_kernels.cu index 44273a323f..3a4a77b3dd 100644 --- a/lib/kernels/src/cuda/ops/element_binary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_binary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/element_binary_kernels.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 056c80ecf6..218e74b939 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/get_op_type.h" @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 973d05f596..594a183ff0 100644 --- 
a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/flat_kernels.h" @@ -35,8 +35,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 31c1bac217..19e495a540 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -13,14 +13,12 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/gather_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Gather { +namespace FlexFlow::Kernels::Gather { template __global__ void gather_forward(float const *input, @@ -125,11 +123,15 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) + output.shape + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); @@ -157,9 +159,13 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape - .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume() + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 
1)) + .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = @@ -180,6 +186,4 @@ void backward_kernel(ffStream_t stream, output_dim_size); } -} // namespace Gather -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..02bda55828 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/linear_kernels.h" #include "utils/integer_conversions.h" @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + static_cast(weight_ptr), weight_type, in_dim, - input_ptr, + static_cast(input_ptr), input_type, in_dim, &beta, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + static_cast(bias_ptr), weight_type, 1, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + static_cast(output_ptr), &beta, m.outputTensor, - output_ptr)); + 
static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + static_cast(input_ptr), input_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - kernel_grad_ptr, + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float 
*)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - bias_grad_ptr, + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + static_cast(kernel_ptr), weight_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - input_grad_ptr, + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 2831562f58..b8dfac5204 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/partition_kernels.h" @@ -40,8 +40,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel> <<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..e8ea3f64c2 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/pool_2d_kernels.h" namespace FlexFlow { @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduce_kernels.cu b/lib/kernels/src/cuda/ops/reduce_kernels.cu index 02a89da807..563bbae21d 100644 --- a/lib/kernels/src/cuda/ops/reduce_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduce_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reduce_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 5d95a3766a..d9c09b082d 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reduction_kernels.h" @@ -55,8 +55,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -75,9 +75,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 4706f38fd4..4685fd7a2d 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels.h" @@ -22,8 +22,8 @@ namespace Kernels { namespace Replicate { template -__global__ void replicate_backward_kernel(T *input_ptr, - T const *output_ptr, +__global__ void replicate_backward_kernel(T const *output_ptr, + T *input_ptr, size_t num_elements, size_t num_replicas) { CUDA_KERNEL_LOOP(i, num_elements) { @@ -38,7 +38,6 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -51,15 +50,15 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( - input.get(), output.get(), + input.get(), input.shape.num_elements().unwrap_nonnegative(), num_replicas); } @@ -72,11 +71,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index c5a289ce6b..a6a390b38e 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reshape_kernels.h" @@ -43,8 +43,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..582aa02386 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -13,13 +13,11 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_params.h" -namespace FlexFlow { - -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, @@ -27,23 +25,24 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t reverse_dim_size, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t out_idx = i; coord_t blk_idx = i / (reverse_dim_size * in_blk_size); i = i - blk_idx * (reverse_dim_size * in_blk_size); coord_t reverse_dim_idx = i / in_blk_size; i = i - reverse_dim_idx * in_blk_size; coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + out_ptr[out_idx] = in_ptr[in_idx]; } } -void forward_kernel(cudaStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t 
reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { +static void forward_kernel_internal(cudaStream_t stream, + float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { reverse_forward_kernel<< 0.0f) { - V[i] = V[i] * momentum + gt; - if (nesterov) { - gt = gt + momentum * V[i]; - } else { - gt = V[i]; - } - } - W[i] -= lr * gt; - } -} - -__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - apply_add_with_scale - <<>>( - (float *)w_grad_ptr, src, size, 1.0f); - } - // checkCUDA(cudaDeviceSynchronize()); - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr) { - // Use NCCL to sync gradients - // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); - // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); - // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); - - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} -#endif - -// 
================================================================== -// Adam Optimizer -// ================================================================== -__global__ void - add_kernel(int count, float scale, float const *src, float *dst) { - CUDA_KERNEL_LOOP(i, count) { - dst[i] += src[i] * scale; - } -} - -__global__ void scale_kernel(int count, float a, float b, float *ptr) { - CUDA_KERNEL_LOOP(i, count) { - ptr[i] = (b - a) * ptr[i] + a; - } -} - -__global__ void adam_update(int count, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *WGrad, - float *M, - float *V, - float *W) { - // Reference for weight decay - // https://www.fast.ai/2018/07/02/adam-weight-decay/ - CUDA_KERNEL_LOOP(i, count) { - // W[i] -= weight_decay * alpha_t * W[i]; - // float gt = WGrad[i]; - float gt = WGrad[i] + weight_decay * W[i]; - float mt = beta1 * M[i] + (1 - beta1) * gt; - float vt = beta2 * V[i] + (1 - beta2) * gt * gt; - M[i] = mt; - V[i] = vt; - W[i] -= alpha_t * mt / (sqrt(vt) + epsilon); - } -} - -__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); - } - // checkCUDA(cudaDeviceSynchronize()); - // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", - // op->alpha, op->alpha_t, op->weight_decay); - // Step 2: Adam update - adam_update<<>>( - size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, - PerDeviceOpState 
const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - // Use NCCL to sync gradients - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); - // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", - // op->alpha, op->alpha_t, op->weight_decay); - // Step 2: Adam update - adam_update<<>>( - size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} -#endif - -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu new file mode 100644 index 0000000000..fe817876ce --- /dev/null +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -0,0 +1,205 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/device.h" +#include "kernels/nccl.h" +#include "kernels/optimizer_kernels.h" +#include "utils/exception.h" + +namespace FlexFlow { + +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + float gt = WGrad[i] + weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * momentum + gt; + if (nesterov) { + gt = gt + momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= lr * gt; + } +} + +__host__ void sgd_ps_update_task_gpu(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *sgd_v_ptr) { + // Step 1: Gather gradients in the first replica + for (int i = 1; i < num_replicas; i++) { + float const *src = weight_grad_ptr + i * size; + apply_add_with_scale + <<>>( + (float *)weight_grad_ptr, src, size, 1.0f); + } + + // Step 2: SGD update + sgd_update<<>>(size, + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); +} + +#ifdef FF_USE_NCCL +__host__ void sgd_nccl_update_task_gpu(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr) { + // Step 1: Use NCCL to sync gradients + ncclComm_t comm = handle.ncclComm; + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); + + // Step 2: SGD update + sgd_update<<>>( + size, lr, weight_decay, momentum, nesterov, w_grad_ptr, v_ptr, w_ptr); +} +#endif + +// ================================================================== +// Adam Optimizer +// ================================================================== +__global__ void + 
add_kernel(int count, float scale, float const *src, float *dst) { + CUDA_KERNEL_LOOP(i, count) { + dst[i] += src[i] * scale; + } +} + +__global__ void scale_kernel(int count, float a, float b, float *ptr) { + CUDA_KERNEL_LOOP(i, count) { + ptr[i] = (b - a) * ptr[i] + a; + } +} + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W) { + // Reference for weight decay + // https://www.fast.ai/2018/07/02/adam-weight-decay/ + CUDA_KERNEL_LOOP(i, count) { + // W[i] -= weight_decay * alpha_t * W[i]; + // float gt = WGrad[i]; + float gt = WGrad[i] + weight_decay * W[i]; + float mt = beta1 * M[i] + (1 - beta1) * gt; + float vt = beta2 * V[i] + (1 - beta2) * gt * gt; + M[i] = mt; + V[i] = vt; + W[i] -= alpha_t * mt / (sqrt(vt) + epsilon); + } +} + +__host__ void adam_ps_update_task_gpu(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr) { + // Step 1: Gather gradients in the first replica + for (int i = 1; i < num_replicas; i++) { + float const *src = w_grad_ptr + i * size; + add_kernel<<>>( + (float *)w_grad_ptr, src, size); + } + + // Step 2: Adam update + adam_update<<>>(size, + alpha_t, + beta1, + beta2, + weight_decay, + epsilon, + w_grad_ptr, + m_ptr, + v_ptr, + w_ptr); +} + +#ifdef FF_USE_NCCL +__host__ void nccl_update_task_gpu(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr) { + // Step 1: Use NCCL to sync gradients + checkNCCL(ncclAllReduce(w_grad_ptr, + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + handle.ncclComm, + stream)); + + // Step 2: Adam update + adam_update<<>>(size, + alpha_t, + 
beta1, + beta2, + weight_decay, + epsilon, + w_grad_ptr, + m_ptr, + v_ptr, + w_ptr); +} +#endif + +} // namespace FlexFlow diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp index 7ca3149f2f..aefe53cc46 100644 --- a/lib/kernels/src/hip/embedding_kernels.cpp +++ b/lib/kernels/src/hip/embedding_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/embedding_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include @@ -364,8 +364,8 @@ struct ForwardKernel { weight.data_type == DataType::FLOAT || weight.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -374,10 +374,11 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -386,9 +387,7 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } @@ -408,8 +407,9 @@ struct BackwardKernel { assert(output.data_type == DataType::HALF || output.data_type == DataType::FLOAT || output.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), + + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -418,9 +418,11 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - batch_size); + in_dim, 
+ batch_size, + aggr); } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -429,9 +431,7 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } diff --git a/lib/kernels/src/hip/loss_function_kernels.cpp b/lib/kernels/src/hip/loss_function_kernels.cpp index e82b5c96d5..05068f1bd0 100644 --- a/lib/kernels/src/hip/loss_function_kernels.cpp +++ b/lib/kernels/src/hip/loss_function_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/loss_function_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/attention_kernels.cpp b/lib/kernels/src/hip/ops/attention_kernels.cpp index 005cef30d1..b374ead305 100644 --- a/lib/kernels/src/hip/ops/attention_kernels.cpp +++ b/lib/kernels/src/hip/ops/attention_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/attention_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp index c4b3be823f..6d9ae8a268 100644 --- a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_matmul_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp index 8e94b462cd..764a3e0b58 100644 --- a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_norm_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/ff_handle.h" #include diff --git 
a/lib/kernels/src/hip/ops/cast_kernels.cpp b/lib/kernels/src/hip/ops/cast_kernels.cpp index fa0c37ffa1..1035657c04 100644 --- a/lib/kernels/src/hip/ops/cast_kernels.cpp +++ b/lib/kernels/src/hip/ops/cast_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/cast_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/combine_kernels.cpp b/lib/kernels/src/hip/ops/combine_kernels.cpp index aa01f02276..f1e0422747 100644 --- a/lib/kernels/src/hip/ops/combine_kernels.cpp +++ b/lib/kernels/src/hip/ops/combine_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/combine_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/concat_kernels.cpp b/lib/kernels/src/hip/ops/concat_kernels.cpp index aa38be739b..a215d67942 100644 --- a/lib/kernels/src/hip/ops/concat_kernels.cpp +++ b/lib/kernels/src/hip/ops/concat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/concat_kernels.h" -#include "device.h" +#include "internal/device.h" #include #include diff --git a/lib/kernels/src/hip/ops/conv_2d_kernels.h b/lib/kernels/src/hip/ops/conv_2d_kernels.h index bcf015d561..76a73ab08c 100644 --- a/lib/kernels/src/hip/ops/conv_2d_kernels.h +++ b/lib/kernels/src/hip/ops/conv_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H #define _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/hip/ops/dropout_kernels.cpp b/lib/kernels/src/hip/ops/dropout_kernels.cpp index baaf8e6902..d85c0ae054 100644 --- a/lib/kernels/src/hip/ops/dropout_kernels.cpp +++ b/lib/kernels/src/hip/ops/dropout_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/dropout_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include diff --git 
a/lib/kernels/src/hip/ops/element_binary_kernels.cpp b/lib/kernels/src/hip/ops/element_binary_kernels.cpp index bc66bbff2f..9e0452b09b 100644 --- a/lib/kernels/src/hip/ops/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_binary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_binary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.dtg.h" diff --git a/lib/kernels/src/hip/ops/element_unary_kernels.cpp b/lib/kernels/src/hip/ops/element_unary_kernels.cpp index f4b0ccb82d..163f13a6da 100644 --- a/lib/kernels/src/hip/ops/element_unary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_unary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_unary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "op-attrs/get_op_type.h" #include diff --git a/lib/kernels/src/hip/ops/flat_kernels.cpp b/lib/kernels/src/hip/ops/flat_kernels.cpp index 763fb9e322..dedfb4b9a9 100644 --- a/lib/kernels/src/hip/ops/flat_kernels.cpp +++ b/lib/kernels/src/hip/ops/flat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/flat_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include diff --git a/lib/kernels/src/hip/ops/gather_kernels.cpp b/lib/kernels/src/hip/ops/gather_kernels.cpp index 17c0014e98..6e9e4c6a2c 100644 --- a/lib/kernels/src/hip/ops/gather_kernels.cpp +++ b/lib/kernels/src/hip/ops/gather_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/gather_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/partition_kernels.cpp b/lib/kernels/src/hip/ops/partition_kernels.cpp index 4591247faa..26748a7e45 100644 --- a/lib/kernels/src/hip/ops/partition_kernels.cpp +++ b/lib/kernels/src/hip/ops/partition_kernels.cpp @@ -14,7 +14,7 @@ */ #include 
"kernels/partition_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp index ed942c105c..7e5ae2ab80 100644 --- a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp +++ b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/pool_2d_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/reduce_kernels.cpp b/lib/kernels/src/hip/ops/reduce_kernels.cpp index 468543dd5b..c0bcc84d48 100644 --- a/lib/kernels/src/hip/ops/reduce_kernels.cpp +++ b/lib/kernels/src/hip/ops/reduce_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reduce_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/replicate_kernels.cpp b/lib/kernels/src/hip/ops/replicate_kernels.cpp index 8d27bb1908..ee7bf701c0 100644 --- a/lib/kernels/src/hip/ops/replicate_kernels.cpp +++ b/lib/kernels/src/hip/ops/replicate_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/replicate_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reshape_kernels.cpp b/lib/kernels/src/hip/ops/reshape_kernels.cpp index 47978a5f4a..810b929e24 100644 --- a/lib/kernels/src/hip/ops/reshape_kernels.cpp +++ b/lib/kernels/src/hip/ops/reshape_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reshape_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reverse_kernels.cpp b/lib/kernels/src/hip/ops/reverse_kernels.cpp index 03e97245bf..a56ff3540a 100644 --- a/lib/kernels/src/hip/ops/reverse_kernels.cpp +++ b/lib/kernels/src/hip/ops/reverse_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reverse_kernels.h" -#include "device.h" 
+#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/softmax_kernels.cpp b/lib/kernels/src/hip/ops/softmax_kernels.cpp index 3a8f2813b7..610675850b 100644 --- a/lib/kernels/src/hip/ops/softmax_kernels.cpp +++ b/lib/kernels/src/hip/ops/softmax_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/softmax_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/split_kernels.cpp b/lib/kernels/src/hip/ops/split_kernels.cpp index 5599ae6d6f..3034b633a6 100644 --- a/lib/kernels/src/hip/ops/split_kernels.cpp +++ b/lib/kernels/src/hip/ops/split_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/split_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/topk_kernels.cpp b/lib/kernels/src/hip/ops/topk_kernels.cpp index f085c5831f..777d9edffa 100644 --- a/lib/kernels/src/hip/ops/topk_kernels.cpp +++ b/lib/kernels/src/hip/ops/topk_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/topk_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/transpose_kernels.cpp b/lib/kernels/src/hip/ops/transpose_kernels.cpp index ef9dd58c63..c5122f34bf 100644 --- a/lib/kernels/src/hip/ops/transpose_kernels.cpp +++ b/lib/kernels/src/hip/ops/transpose_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/transpose_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "utils/exception.h" #include diff --git a/lib/kernels/src/device.cc b/lib/kernels/src/internal/device.cc similarity index 97% rename from lib/kernels/src/device.cc rename to lib/kernels/src/internal/device.cc index f46099c79a..eb3d229c2a 100644 --- a/lib/kernels/src/device.cc +++ b/lib/kernels/src/internal/device.cc @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" namespace FlexFlow { diff --git 
a/lib/kernels/src/device.h b/lib/kernels/src/internal/device.h similarity index 98% rename from lib/kernels/src/device.h rename to lib/kernels/src/internal/device.h index ceff2f92ff..226c7ad174 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/internal/device.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_KERNELS_SRC_DEVICE_H -#define _FLEXFLOW_KERNELS_SRC_DEVICE_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H #include "kernels/array_shape.h" #include "kernels/device.h" diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc new file mode 100644 index 0000000000..b5042f77a0 --- /dev/null +++ b/lib/kernels/src/kernels/accessor.cc @@ -0,0 +1,249 @@ +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" +#include "utils/containers/reversed.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include + +namespace FlexFlow { + +nonnegative_int + calculate_accessor_offset(LegionOrdered const &indices, + ArrayShape const &shape) { + ASSERT(indices.size() == shape.num_dims(), + "Number of indices does not match the number of dimensions"); + + nonnegative_int offset = 0_n; + nonnegative_int multiplier = 1_n; + + for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { + ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}), + "Out of bounds access", + dim); + + offset += indices.at(dim) * multiplier; + multiplier *= shape.at(legion_dim_t{dim}); + } + + return offset; +} + +void copy_accessor_data_to_l_from_r( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = + dst_accessor.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if 
(src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + assert(src_device_type == DeviceType::GPU); + assert(dst_device_type == DeviceType::GPU); + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); +} + +GenericTensorAccessorW::GenericTensorAccessorW( + DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorW::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorW::operator==( + GenericTensorAccessorW const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorW::operator!=( + GenericTensorAccessorW const &other) const { + return this->tie() != other.tie(); +} + +int32_t *GenericTensorAccessorW::get_int32_ptr() const { + return this->get(); +} + +int64_t *GenericTensorAccessorW::get_int64_ptr() const { + return this->get(); +} + +float *GenericTensorAccessorW::get_float_ptr() const { + return this->get(); +} + +double *GenericTensorAccessorW::get_double_ptr() const { + return this->get(); +} + +half *GenericTensorAccessorW::get_half_ptr() const { + return this->get(); +} + +std::string format_as(GenericTensorAccessorW const &a) { + return fmt::format("", + 
a.data_type, + a.shape, + a.ptr); +} + +std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { + return (s << fmt::to_string(a)); +} + +GenericTensorAccessorR::GenericTensorAccessorR( + DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorR::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorR::operator==( + GenericTensorAccessorR const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorR::operator!=( + GenericTensorAccessorR const &other) const { + return this->tie() != other.tie(); +} + +int32_t const *GenericTensorAccessorR::get_int32_ptr() const { + return this->get(); +} + +int64_t const *GenericTensorAccessorR::get_int64_ptr() const { + return this->get(); +} + +float const *GenericTensorAccessorR::get_float_ptr() const { + return this->get(); +} + +double const *GenericTensorAccessorR::get_double_ptr() const { + return this->get(); +} + +half const *GenericTensorAccessorR::get_half_ptr() const { + return get(); +} + +std::string format_as(GenericTensorAccessorR const &a) { + return fmt::format("", + a.data_type, + a.shape, + a.ptr); +} + +std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { + return (s << fmt::to_string(a)); +} + +int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +float const *get_float_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +double const *get_double_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +half const *get_half_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +std::vector + get_int32_ptrs(std::vector const &a) { + return get(a); +} + +std::vector 
+ get_int64_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_float_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_double_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_half_ptrs(std::vector const &a) { + return get(a); +} + +GenericTensorAccessorR read_only_accessor_from_write_accessor( + GenericTensorAccessorW const &writable) { + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.device_type}; +} + +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { + return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; +} + +bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, + ArrayShape const &expected_shape, + DataType const &expected_dtype) { + return accessor.shape == expected_shape && + accessor.data_type == expected_dtype; +} + +std::pair + get_shape_and_datatype(GenericTensorAccessorR const &accessor) { + return std::make_pair(accessor.shape, accessor.data_type); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc new file mode 100644 index 0000000000..b9f253bcff --- /dev/null +++ b/lib/kernels/src/kernels/allocation.cc @@ -0,0 +1,38 @@ +#include "kernels/allocation.h" +#include "op-attrs/tensor_shape.h" + +namespace FlexFlow { + +void *Allocator::allocate(size_t mem_size) { + return this->i_allocator->allocate(mem_size); +} + +void Allocator::deallocate(void *ptr) { + this->i_allocator->deallocate(ptr); +} + +DeviceType Allocator::get_allocation_device_type() const { + return this->i_allocator->get_allocation_device_type(); +} + +GenericTensorAccessorW + Allocator::allocate_tensor(TensorShape const &tensor_shape) { + void *ptr = + this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); + return GenericTensorAccessorW{ + tensor_shape.data_type, + 
array_shape_from_tensor_shape(tensor_shape), + ptr, + this->get_allocation_device_type(), + }; +} + +void Allocator::deallocate_tensor(GenericTensorAccessorW const &t) { + this->deallocate(t.ptr); +} + +void Allocator::deallocate_tensor(GenericTensorAccessorR const &t) { + this->deallocate(const_cast(t.ptr)); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc similarity index 51% rename from lib/kernels/src/array_shape.cc rename to lib/kernels/src/kernels/array_shape.cc index 243185ada4..34a53c1bb3 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/kernels/array_shape.cc @@ -1,23 +1,20 @@ #include "kernels/array_shape.h" +#include "kernels/legion_ordered/slice.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/slice.h" +#include "utils/containers/cartesian_product.h" #include "utils/containers/product.h" #include "utils/containers/reversed.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include "utils/containers/vector_of.h" +#include "utils/hash/tuple.h" +#include "utils/hash/vector.h" #include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -static LegionOrdered - legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { - return LegionOrdered{reversed(vector_of(ff_ordered))}; -} - -ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) - : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} - -ArrayShape::ArrayShape(TensorShape const &shape) - : dims(legion_dims_from_ff_dims(shape.dims.ff_ordered)) {} - -ArrayShape::ArrayShape(std::vector const &input_dims) +ArrayShape::ArrayShape(LegionOrdered const &input_dims) : dims(input_dims) {} nonnegative_int ArrayShape::get_volume() const { @@ -59,10 +56,19 @@ bool ArrayShape::operator!=(ArrayShape const &other) const { return this->tie() != other.tie(); } -ArrayShape ArrayShape::sub_shape( - std::optional> start, - std::optional> end) 
const { - NOT_IMPLEMENTED(); +ArrayShape + ArrayShape::sub_shape(ff_dim_t const &start, + std::optional const &maybe_end) const { + FFOrdered ff_ordered_dims = + ff_ordered_from_legion_ordered(this->dims); + FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); + return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; +} + +ArrayShape + ArrayShape::sub_shape(legion_dim_t const &start, + std::optional const &maybe_end) const { + return ArrayShape{slice(this->dims, start, maybe_end)}; } std::optional ArrayShape::at_maybe(legion_dim_t index) const { @@ -81,15 +87,6 @@ std::tuple const &> ArrayShape::tie() const { return std::tie(this->dims); } -nonnegative_int get_volume(ArrayShape const &shape) { - return shape.get_volume(); -} - -TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { - return TensorShape{TensorDims{ff_ordered_from_legion_ordered(shape.dims)}, - dtype}; -} - std::string format_as(ArrayShape const &x) { std::ostringstream oss; oss << " get_array_coord_set(ArrayShape const &shape) { + std::vector> per_dim_ranges = + transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), + [](nonnegative_int dim_size) -> std::vector { + return nonnegative_range(dim_size); + }); + + std::unordered_set> raw_points = + unordered_set_of(cartesian_product(per_dim_ranges)); + + return transform(raw_points, + [](std::vector const &raw_point) { + return ArrayCoord{ff_ordered_of(raw_point)}; + }); +} + } // namespace FlexFlow + +namespace std { + +using namespace FlexFlow; + +size_t hash::operator()(ArrayShape const &s) const { + return get_std_hash(s.tie()); +} + +} // namespace std diff --git a/lib/kernels/src/kernels/copy_tensor_accessor.cc b/lib/kernels/src/kernels/copy_tensor_accessor.cc new file mode 100644 index 0000000000..d8619d8ce6 --- /dev/null +++ b/lib/kernels/src/kernels/copy_tensor_accessor.cc @@ -0,0 +1,66 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template 
+struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +GenericTensorAccessorR copy_tensor_accessor_r_to_cpu_if_necessary( + GenericTensorAccessorR const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_r(accessor, cpu_allocator); + } else { + return accessor; + } +} + +GenericTensorAccessorW copy_tensor_accessor_w_to_cpu_if_necessary( + GenericTensorAccessorW const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_w(accessor, cpu_allocator); + } else { + return accessor; + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc new file mode 100644 index 
0000000000..1b8ab35d89 --- /dev/null +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -0,0 +1,184 @@ +#include "kernels/format_accessor_contents.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +template +struct Print1DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 1_n); + + nonnegative_int ncols = accessor.shape.at(ff_dim_t{0_n}); + + stream << "[" + << join_strings(nonnegative_range(ncols), + " ", + [&](nonnegative_int col_idx) -> std::string { + return fmt::to_string( + accessor.at
(FFOrdered{col_idx})); + }) + << "]"; + } +}; + +static std::string + format_1d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 1_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print2DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 2_n); + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + + auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim1_size), + " ", + [&](nonnegative_int dim1_idx) -> std::string { + return fmt::to_string( + accessor.at
(FFOrdered{dim0_idx, dim1_idx})); + }) + + "]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_1d)) + << "\n]"; + } +}; + +static std::string + format_2d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 2_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print3DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 3_n); + + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + nonnegative_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); + + auto render_1d = [&](nonnegative_int dim0_idx, + nonnegative_int dim1_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim2_size), + " ", + [&](nonnegative_int dim2_idx) -> std::string { + return fmt::to_string(accessor.at
( + FFOrdered{dim0_idx, dim1_idx, dim2_idx})); + }) + + "]"; + }; + + auto render_2d = [&](nonnegative_int dim0_idx) -> std::string { + return "[\n" + + indent(join_strings(nonnegative_range(dim1_size), + "\n", + [&](nonnegative_int dim1_idx) -> std::string { + return render_1d(dim0_idx, dim1_idx); + })) + + "\n]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_2d)) + << "\n]"; + } +}; + +static std::string + format_3d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 3_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +static std::string + format_1d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_1d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +static std::string + format_2d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_2d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +static std::string + format_3d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_3d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + + int num_dims = accessor.shape.num_dims().unwrap_nonnegative(); + switch (num_dims) { + case 1: + return format_1d_accessor_r_contents(accessor); + case 2: + return format_2d_accessor_r_contents(accessor); + case 3: + return format_3d_accessor_r_contents(accessor); + default: + PANIC("Unhandled accessor dimensionality", num_dims); + } +} + +std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) 
{ + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = + copy_tensor_accessor_w_to_cpu_if_necessary(accessor, cpu_allocator); + + int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative(); + switch (num_dims) { + case 1: + return format_1d_accessor_w_contents(cpu_accessor); + case 2: + return format_2d_accessor_w_contents(cpu_accessor); + case 3: + return format_3d_accessor_w_contents(cpu_accessor); + default: + PANIC("Unhandled accessor dimensionality", num_dims); + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/kernels/legion_dim.cc similarity index 78% rename from lib/kernels/src/legion_dim.cc rename to lib/kernels/src/kernels/legion_dim.cc index bbb15c5636..f3482b1d9b 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/kernels/legion_dim.cc @@ -1,7 +1,11 @@ #include "kernels/legion_dim.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { +using T = value_type<0>; +template std::set key_range(LegionOrdered const &); + legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { return legion_dim_t{ nonnegative_int{legion_dim.value.unwrap_nonnegative() + value}}; @@ -11,6 +15,7 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, nonnegative_int num_dimensions) { return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - ff_dim.value.unwrap_nonnegative() - 1}}; + ; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..8af44173b0 --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,10 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct LegionOrdered; + +} // namespace FlexFlow diff --git 
a/lib/kernels/src/kernels/legion_ordered/slice.cc b/lib/kernels/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..69fcf570aa --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template LegionOrdered slice(LegionOrdered const &, + legion_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/transform.cc b/lib/kernels/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..d9fb38198e --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template LegionOrdered transform(LegionOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/kernels/src/kernels/local_cpu_allocator.cc similarity index 52% rename from lib/local-execution/src/local_cpu_allocator.cc rename to lib/kernels/src/kernels/local_cpu_allocator.cc index 4ca5f987a8..738d1abf27 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/kernels/src/kernels/local_cpu_allocator.cc @@ -1,20 +1,27 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" #include "utils/containers/contains_key.h" +#include +#include namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { void *ptr = malloc(requested_memory_size); + ASSERT(ptr != nullptr); this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); return ptr; } void LocalCPUAllocator::deallocate(void *ptr) { - if (contains_key(this->ptrs, ptr)) { - this->ptrs.erase(ptr); - } else { - throw 
std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains_key(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + free(ptr); + this->ptrs.erase(ptr); +} + +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; } Allocator create_local_cpu_memory_allocator() { diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/kernels/local_cuda_allocator.cc similarity index 59% rename from lib/kernels/src/local_cuda_allocator.cc rename to lib/kernels/src/kernels/local_cuda_allocator.cc index cdcfb017a0..1b081517bf 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/kernels/local_cuda_allocator.cc @@ -1,6 +1,7 @@ #include "kernels/local_cuda_allocator.h" #include "kernels/device.h" #include "utils/containers/contains.h" +#include namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { @@ -11,13 +12,15 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) { } void LocalCudaAllocator::deallocate(void *ptr) { - if (contains(this->ptrs, ptr)) { - checkCUDA(cudaFree(ptr)); - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + checkCUDA(cudaFree(ptr)); + this->ptrs.erase(ptr); +} + +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; } LocalCudaAllocator::~LocalCudaAllocator() { @@ -27,7 +30,8 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc new file mode 
100644 index 0000000000..c647181872 --- /dev/null +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -0,0 +1,30 @@ +#include "kernels/reverse_kernels_params.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs) { + auto axis = attrs.axis; + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(output_shape.get_dim())) { + if (i < axis.value) { + in_blk_size *= output_shape.at(ff_dim_t{i}); + } else if (i == axis.value) { + reverse_dim_size = output_shape.at(ff_dim_t{i}); + } else { + num_out_blks *= output_shape.at(ff_dim_t{i}); + } + } + + return ReverseKernelsParams{ + num_out_blks, + reverse_dim_size, + in_blk_size, + output_shape.get_volume(), + }; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,36 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + this->cleanup(); + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + this->cleanup(); +} + +void ManagedFFStream::cleanup() { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete 
this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..ea26d2350c 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,16 +1,17 @@ #include "kernels/managed_per_device_ff_handle.h" -#include "device.h" +#include "internal/device.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; - - checkCUDNN(cudnnCreate(&handle->dnn)); - checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { + this->handle = new PerDeviceFFHandle{}; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; + + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +20,23 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + this->cleanup(); + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + this->cleanup(); +} + +void 
ManagedPerDeviceFFHandle::cleanup() { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; } } diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 00da2d0d70..066cb96753 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,6 +14,7 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) set(FF_TEST_EXEC_NAME "kernels-tests") diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..8630dcd8cd --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,57 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Replicate::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + + GenericTensorAccessorR correct = input; + + Kernels::Replicate::cpu_forward_kernel(input, result); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + TEST_CASE("Replicate::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + cpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 
+ 3, 1 + 3 + 5}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + Kernels::Replicate::cpu_backward_kernel(output, result, 3); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } +} diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..db0016cb0b --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,206 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Reverse::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + 
}, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } + + TEST_CASE("Reverse::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + 
SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } +} diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc new file mode 100644 index 0000000000..0f34a6aa06 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -0,0 +1,392 @@ +#include "internal/test_utils.h" +#include "op-attrs/tensor_shape.h" +#include "utils/containers/require_all_same1.h" +#include "utils/join_strings.h" +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_zero_filled_accessor_w(shape, allocator); + return read_only_accessor_from_write_accessor(accessor); +} + +GenericTensorAccessorW + create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator) { + nonnegative_int ncols = num_elements(contents); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{col_idx}) = + contents.at(col_idx.unwrap_nonnegative()); + } + + GenericTensorAccessorW result = 
allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator) { + nonnegative_int nrows = num_elements(contents); + ASSERT(nrows > 0); + + nonnegative_int ncols = throw_if_unexpected( + require_all_same1(transform(contents, [](std::vector const &row) { + return num_elements(row); + }))); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{nrows, ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int row_idx : nonnegative_range(nrows)) { + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{row_idx, col_idx}) = + contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + nonnegative_int dim0_size = num_elements(contents); + ASSERT(dim0_size > 0); + + nonnegative_int dim1_size = throw_if_unexpected(require_all_same1( + transform(contents, [](std::vector> const &m) { + return num_elements(m); + }))); + ASSERT(dim1_size > 0); + + nonnegative_int dim2_size = throw_if_unexpected(require_all_same1( + transform(contents, [](std::vector> const &m) { + return throw_if_unexpected( + require_all_same1(transform(m, [](std::vector const &vec) { + return num_elements(vec); + }))); + }))); + ASSERT(dim2_size > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}}, + DataType::FLOAT, + }; + 
+ Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + cpu_accessor.at( + FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = + contents.at(dim0_idx.unwrap_nonnegative()) + .at(dim1_idx.unwrap_nonnegative()) + .at(dim2_idx.unwrap_nonnegative()); + } + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &contents, + Allocator &allocator) { + nonnegative_int dim0_size = num_elements(contents); + ASSERT(dim0_size > 0); + + nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &t) { + return num_elements(t); + }))); + ASSERT(dim1_size > 0); + + nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &m) { + return throw_if_unexpected(require_all_same1( + transform(m, [](std::vector> const &vec) { + return num_elements(vec); + }))); + }))); + ASSERT(dim2_size > 0); + + nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &t) { + return throw_if_unexpected(require_all_same1( + transform(t, [](std::vector> const &mat) { + return throw_if_unexpected(require_all_same1( + transform(mat, [](std::vector const &vec) { + return num_elements(vec); + }))); + }))); + }))); + ASSERT(dim3_size > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}}, + DataType::FLOAT, + }; + + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + + for 
(nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) { + accessor.at( + FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = + contents.at(dim0_idx.unwrap_nonnegative()) + .at(dim1_idx.unwrap_nonnegative()) + .at(dim2_idx.unwrap_nonnegative()) + .at(dim3_idx.unwrap_nonnegative()); + } + } + } + } + + return accessor; +} + +GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_1d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_2d_accessor_r_with_contents( + std::vector> const &contents, Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_2d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_3d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_4d_accessor_w_with_contents(contents, allocator)); +} + +template +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t
; + T *data_ptr = src_accessor.get
(); + + std::random_device rd; + std::mt19937 gen(rd()); + size_t num_elements = get_num_elements(shape).unwrap_nonnegative(); + if constexpr (std::is_same::value) { + std::bernoulli_distribution dist(0.5); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_floating_point::value) { + std::uniform_real_distribution dist(-1.0, 1.0); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_integral::value) { + std::uniform_int_distribution dist(0, 99); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } + + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + return DataTypeDispatch1{}( + shape.data_type, shape, allocator); +} + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w(shape, allocator); + + return read_only_accessor_from_write_accessor(accessor); +} + +template +struct FillWithZeros { + void operator()(GenericTensorAccessorW const &accessor) { + using T = real_type_t
; + + if (accessor.device_type == DeviceType::CPU) { + memset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T)); + } else { + checkCUDA(cudaMemset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * + sizeof(T))); + } + } +}; + +void fill_with_zeros(GenericTensorAccessorW const &accessor) { + DataTypeDispatch1{}(accessor.data_type, accessor); +} + +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
; + + T const *data_ptr = accessor.get
(); + + int volume = accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; + } +}; + +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); +} + +template +struct AccessorsAreEqual { + bool operator()(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor_a = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); + GenericTensorAccessorR cpu_accessor_b = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); + + using T = real_type_t
; + T const *a_data_ptr = cpu_accessor_a.get
(); + T const *b_data_ptr = cpu_accessor_b.get
(); + + int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + if (a_data_ptr[i] != b_data_ptr[i]) { + return false; + } + } + + return true; + } +}; + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + ASSERT(accessor_a.shape == accessor_b.shape, + "accessors_are_equal expects accessors to have the same shape"); + + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); +} + +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + using T = real_type_t
; + if (!val.template has()) { + throw mk_runtime_error("create_filed_accessor expected data type of " + "shape and passed-in value to match"); + } + + auto unwrapped_value = val.get(); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + T *data_ptr = src_accessor.get
(); + + int volume = dst_accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + data_ptr[i] = unwrapped_value; + } + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + return dst_accessor; + } +}; + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + + return DataTypeDispatch1{}( + shape.data_type, shape, allocator, val); +} + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); +} +} // namespace FlexFlow diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/test/src/internal/test_utils.h new file mode 100644 index 0000000000..a4fc9b88c8 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.h @@ -0,0 +1,78 @@ +#ifndef _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H +#define _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H + +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/device.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" +#include +#include +#include +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW + 
create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator); +GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &contents, + Allocator &allocator); + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator); +GenericTensorAccessorR create_2d_accessor_r_with_contents( + std::vector> const &contents, Allocator &allocator); + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator); +GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &contents, + Allocator &allocator); + +GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &contents, + Allocator &allocator); +GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &contents, + Allocator &allocator); + +bool contains_non_zero(GenericTensorAccessorR const &accessor); + +void fill_with_zeros(GenericTensorAccessorW const &accessor); + +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream); + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc new file mode 100644 index 0000000000..98f8471212 --- /dev/null +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -0,0 +1,73 @@ +#include "kernels/accessor.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("calculate_accessor_offset") { + 
SUBCASE("one dimension") { + std::vector indices = {4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 13_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 4_n; + + CHECK(result == correct); + } + + SUBCASE("multiple dimensions") { + std::vector indices = {2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 2_n * 5_n + 4_n; + + CHECK(result == correct); + } + + SUBCASE("zero dimensions") { + std::vector indices = {}; + ArrayShape shape = ArrayShape{std::vector{}}; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 0_n; + + CHECK(result == correct); + } + + SUBCASE("index and shape dimensions do not match") { + std::vector indices = {1_n, 2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + + SUBCASE("out of bounds index") { + std::vector indices = {2_n, 5_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + } +} diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc new file mode 100644 index 0000000000..1fb4c0b541 --- /dev/null +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -0,0 +1,49 @@ +#include "kernels/array_shape.h" +#include "test/utils/doctest/fmt/unordered_set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_array_coord_set") { + SUBCASE("ArrayShape is not empty") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 1_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{0_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{0_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{1_n, 
0_n, 0_n}}, + ArrayCoord{FFOrdered{1_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 1_n}}, + }; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape has a dimension of size zero") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 0_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = {}; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape is zero-dimensional") { + ArrayShape input = ArrayShape{LegionOrdered{}}; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{}}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc new file mode 100644 index 0000000000..915a84c335 --- /dev/null +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -0,0 +1,94 @@ +#include "kernels/format_accessor_contents.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("format_accessor_r_contents(GenericTensorAccessorR)") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("accessor is 1d") { + GenericTensorAccessorR accessor = + create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator); + + std::string correct = "[1 2 3 2]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 2d") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 2, 3, 5}, + {4, 3, 3, 2}, + {1, 1, 5, 8}, + }, + cpu_allocator); + + std::string correct = "[\n" + " [1 2 3 5]\n" + " [4 3 3 2]\n" + " [1 1 5 8]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 3d") { + GenericTensorAccessorR accessor = 
create_3d_accessor_r_with_contents( + { + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, + }, + cpu_allocator); + + std::string correct = "[\n" + " [\n" + " [1 2 3 6]\n" + " [4 3 3 9]\n" + " [1 1 5 1]\n" + " ]\n" + " [\n" + " [4 1 8 7]\n" + " [9 4 2 4]\n" + " [1 0 0 6]\n" + " ]\n" + " [\n" + " [2 1 1 9]\n" + " [1 3 6 2]\n" + " [1 9 8 9]\n" + " ]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is some other dimension") { + GenericTensorAccessorR accessor = + create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator); + + CHECK_THROWS(format_accessor_r_contents(accessor)); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_dim.cc b/lib/kernels/test/src/kernels/legion_dim.cc new file mode 100644 index 0000000000..34822ed1c3 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_dim.cc @@ -0,0 +1,32 @@ +#include "kernels/legion_dim.h" +#include "test/utils/doctest/fmt/set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("key_range(LegionOrdered)") { + SUBCASE("input is non-empty") { + LegionOrdered input = {5, 3, 2, 3}; + + std::set result = key_range(input); + std::set correct = { + legion_dim_t{0_n}, + legion_dim_t{1_n}, + legion_dim_t{2_n}, + legion_dim_t{3_n}, + }; + + CHECK(result == correct); + } + + SUBCASE("input is empty") { + LegionOrdered input = {}; + + std::set result = key_range(input); + std::set correct = {}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..4b50cad735 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include 
"test/utils/rapidcheck.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE_TEMPLATE( + "Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](LegionOrdered) {}); + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/slice.cc b/lib/kernels/test/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..d0211d270e --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,30 @@ +#include "kernels/legion_ordered/slice.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("slice(LegionOrdered, ..., ...") { + LegionOrdered d = LegionOrdered{ + 1, + 2, + 3, + 4, + }; + SUBCASE("legion_dim_t, legion_dim_t") { + LegionOrdered result = slice(d, + legion_dim_t{nonnegative_int{1}}, + legion_dim_t{nonnegative_int{3}}); + LegionOrdered correct = LegionOrdered{2, 3}; + + CHECK(result == correct); + } + SUBCASE("legion_dim_t, std::nullopt_t") { + LegionOrdered result = + slice(d, legion_dim_t{nonnegative_int{1}}, std::nullopt); + LegionOrdered correct = LegionOrdered{2, 3, 4}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/transform.cc b/lib/kernels/test/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..759507264f --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,36 @@ +#include "kernels/legion_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("transform(LegionOrdered, F)") { + SUBCASE("input is empty") { + LegionOrdered input = {}; + + LegionOrdered result = + transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + LegionOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + LegionOrdered input = {2, 1, 2, 5}; + + LegionOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + 
LegionOrdered correct = LegionOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..9064ae4824 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/attention_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { nonnegative_int num_samples = 10_n; nonnegative_int num_heads = 4_n; @@ -19,7 +19,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -39,16 +41,26 @@ TEST_SUITE(FF_TEST_SUITE) { /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), /*add_bias_kv=*/false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); - TensorShape weight_shape = make_float_tensor_shape_from_legion_dims( - {nonnegative_int{state.weightSize}}); + TensorShape query_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, qSize}}, + DataType::FLOAT, + }; + TensorShape key_shape = TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, kSize}}, + DataType::FLOAT, + }; + TensorShape value_shape 
= TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, vSize}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, oProjSize}}, + DataType::FLOAT, + }; + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{nonnegative_int{state.weightSize}}}, + DataType::FLOAT, + }; GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -72,9 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..5f63b48198 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_matmul_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { nonnegative_int m = 10_n; nonnegative_int n = 10_n; @@ -15,16 +15,24 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); - TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + 
TensorShape input_shape_a = TensorShape{ + TensorDims{FFOrdered{batch, k, m}}, + DataType::FLOAT, + }; + TensorShape input_shape_b = TensorShape{ + TensorDims{FFOrdered{batch, n, k}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{batch, n, m}}, + DataType::FLOAT, + }; GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..903ad8cc43 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,10 +1,11 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { nonnegative_int output_n = 1_n; nonnegative_int output_c = 10_n; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -26,25 +29,33 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_w=*/output_w.unwrap_nonnegative(), /*relu=*/true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = TensorShape{ + 
TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape scale_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape bias_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel( /*stream=*/managed_stream.raw_stream(), @@ -54,10 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*scale_ptr=*/scale_accessor.get_float_ptr(), /*bias_ptr=*/bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -73,9 +81,9 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, - /*input_ptr=*/input_accessor.get_float_ptr(), - /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), /*output_ptr=*/output_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + 
/*input_ptr=*/input_accessor.get_float_ptr(), /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), /*scale_ptr=*/scale_accessor.get_float_ptr(), /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), @@ -83,19 +91,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*numElements=*/ input_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector host_scale_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector host_bias_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 0e0769014d..0c41fe12ac 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,56 +1,86 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/cast_kernels.h" -#include "test_utils.h" -#include +#include "kernels/cast_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Cast Forward and Backward Kernels") { ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); - TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100_n, 100_n}); - - GenericTensorAccessorW output_accessor = - 
create_random_filled_accessor_w(output_shape, allocator); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::DOUBLE, + }; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector host_double_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector host_grad_float_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = 
create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::DOUBLE, + }; + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..2040dcbd5d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,39 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/combine_kernels.h" -#include "test_utils.h" +#include "kernels/combine_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; 
Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +41,66 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n, 5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine 
Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..c2df907917 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,56 +1,113 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/concat_kernels.h" -#include "test_utils.h" #include 
"utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - nonnegative_int num_inputs = 3_n; - nonnegative_int size_per_input = 100_n; - ff_dim_t concat_axis = ff_dim_t{0_n}; - - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); - Allocator allocator = create_local_cuda_memory_allocator(); + const nonnegative_int num_inputs = 4_n; + SUBCASE("forward_kernel") { - std::vector input_accessors = - repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - }); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + auto run_forward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + std::vector input_accessors = + repeat(num_inputs, [&]() { + return create_random_filled_accessor_r(input_shape, allocator); + }); + + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Concat::forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); + + 
CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test forward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test forward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - std::vector input_grad_accessors = repeat( - num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - concat_axis); + auto run_backward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + + std::vector input_grad_accessors = + repeat(num_inputs, [&]() { + return create_zero_filled_accessor_w(input_shape, allocator); + }); + + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); + + for (auto &accessor : input_grad_accessors) { + CHECK(contains_non_zero(accessor)); + } + }; + + SUBCASE("test backward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape 
output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test backward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } } } diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index ed5852bc31..de3215cf2d 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" -#include "test_utils.h" +#include "internal/test_utils.h" +#include #include namespace FlexFlow { -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test CUDA") { int deviceCount = 0; diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..409b06d9a9 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,38 +1,37 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/dropout_kernels.h" -#include "test_utils.h" #include "utils/containers/count.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Dropout Kernels") { unsigned long long seed = 12345; float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10_n, 10_n}, + std::vector{10_n, 10_n}, }; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle 
managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); DropoutPerDeviceState state = Kernels::Dropout::init_kernel( managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); - auto get_zero_count = [](std::vector const &data) { - return count(data, [](float x) { return x == 0.0f; }); - }; - SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,11 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..f8a3abdb98 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,21 +1,27 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/flat_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + 
TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +31,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector backward_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git 
a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..f0be809475 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,61 +1,107 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/gather_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{2_n}}; + legion_dim_t{0_n}}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW output_accessor = - 
allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); - - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + 
create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..02a95ba58a 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,23 +1,30 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/layer_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { nonnegative_int batch_size = 10_n; nonnegative_int feature_size = 10_n; float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, feature_size}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + TensorShape feature_shape = TensorShape{ + 
TensorDims{FFOrdered{feature_size}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +38,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..fb5920adcc --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,107 @@ +#include "internal/test_utils.h" +#include "kernels/gather_kernels.h" +#include + +using namespace 
::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedFFStream") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + Allocator allocator = create_local_cuda_memory_allocator(); + + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{0_n}}; + + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape const &input_shape, + TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Gather::forward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input=*/input_accessor, + /*index=*/index_accessor, + /*output=*/output_accessor); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + } + + SUBCASE("backward_kernel") { + auto run_backward_test = [&](TensorShape const &input_shape, + 
TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*output_grad=*/output_grad_accessor, + /*index=*/index_accessor, + /*input_grad=*/input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..fc67764cdb --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,37 @@ +#include "kernels/managed_per_device_ff_handle.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedPerDeviceFFHandle") { + ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + SUBCASE("constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("move constructor") { + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + CHECK(&new_handle.raw_handle() == 
base_handle_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + new_handle = std::move(base_handle); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..5452266dad 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/partition_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -14,48 +17,36 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionPerDeviceState state = Kernels::Repartition::init_kernel( managed_handle.raw_handle(), DataType::FLOAT); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); 
GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..f2ada8387e 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,9 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/pool_2d_kernels.h" -#include "test_utils.h" +#include 
"op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { nonnegative_int input_w = 10_n; nonnegative_int input_h = 10_n; @@ -22,7 +23,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -46,10 +49,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*stride_w=*/stride_w.unwrap_nonnegative(), /*pool_type=*/pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, input_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -62,28 +69,23 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); 
Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.ptr, - input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 04a3817b84..e13b149769 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,27 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reduction_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {10_n, 10_n, 10_n, 10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n, 10_n, 10_n, 10_n}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({10_n}); + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); 
GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -30,30 +36,22 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); - - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_grad_data = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_grad_data == expected_grad_input_data); + output_grad_accessor, + input_grad_accessor); + + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fa726898f2..83a9a992f7 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,55 +1,150 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" #include "kernels/replicate_kernels.h" -#include "test_utils.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + 
TEST_CASE("Call Replicate Forward and Backward Kernels") { nonnegative_int num_replicas = 10_n; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = input_shape; + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); + + GenericTensorAccessorW output = + gpu_allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor, output_accessor); + managed_stream.raw_stream(), input, output); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + GenericTensorAccessorR correct = input; - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK_MESSAGE(accessors_are_equal(output, correct), + check_kv("output", format_accessor_w_contents(output))); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); - GenericTensorAccessorR 
output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + gpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + + GenericTensorAccessorW input_grad = + gpu_allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor, + output_grad, + input_grad, num_replicas.unwrap_nonnegative()); - std::vector check_aggregated_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(check_aggregated_data)); + CHECK_MESSAGE( + accessors_are_equal(input_grad, correct), + check_kv("input_grad", format_accessor_w_contents(input_grad))); + } + } + + TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { + nonnegative_int num_replicas = 2_n; + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{5_n, num_replicas}}, + DataType::FLOAT, + }; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // 
Run CPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK_MESSAGE( + accessors_are_equal(output_accessor_gpu, output_accessor_cpu), + check_kv("input", format_accessor_r_contents(input_accessor_cpu)), + check_kv("gpu", format_accessor_w_contents(output_accessor_gpu)), + check_kv("cpu", format_accessor_w_contents(output_accessor_cpu))); + } + + SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, + num_replicas.unwrap_nonnegative()); + + // Run CPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Replicate::cpu_backward_kernel( + output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_replicas.unwrap_nonnegative()); + + CHECK_MESSAGE( + accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu), + check_kv("output_grad", + format_accessor_r_contents(output_grad_accessor_cpu)), + check_kv("gpu", format_accessor_w_contents(input_grad_accessor_gpu)), + check_kv("cpu", format_accessor_w_contents(input_grad_accessor_cpu))); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..66c6bf849b 100644 --- 
a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,16 +1,21 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reshape_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +23,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - 
create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..6e12c48ac3 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,63 +1,124 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reverse_kernels.h" -#include "test_utils.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { - nonnegative_int reverse_dim_size = 10_n; - nonnegative_int in_blk_size = 10_n; - nonnegative_int num_out_blks = 1_n; - - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{1_n, 10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - 
read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reverse::forward_kernel( - managed_stream.raw_stream(), - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_accessor.shape.num_elements().unwrap_nonnegative()); - - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(check_output_data)); + managed_stream.raw_stream(), input_accessor, output_accessor, attrs); + + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Reverse::backward_kernel( - managed_stream.raw_stream(), - output_grad_accessor.get_float_ptr(), - input_grad_accessor.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad_accessor.shape.num_elements().unwrap_nonnegative()); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + allocator.allocate_tensor(input_shape); + + Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessor, + attrs); + + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + TensorShape input_shape = 
TensorShape{ + TensorDims{FFOrdered{1_n, 4_n, 3_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + SUBCASE("forward_kernel") { + // Run GPU Cast Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu, + output_accessor_gpu, + attrs); + + // Run CPU Cast Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel( + input_accessor_cpu, output_accessor_cpu, attrs); + + CHECK(accessors_are_equal(output_accessor_cpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Cast Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, + attrs); + + // Run CPU Cast Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); 
+ + Kernels::Reverse::cpu_backward_kernel( + output_grad_accessor_cpu, input_grad_accessor_cpu, attrs); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c9eaa76b86..904cca2d3e 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/softmax_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { nonnegative_int input_n = 1_n; nonnegative_int input_c = 1_n; @@ -12,12 +12,17 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = @@ -40,30 +45,22 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); 
GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector expected_input_grad_data = std::vector( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..44e8f42f76 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,24 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/split_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { nonnegative_int num_outputs = 2_n; coord_t out_blk_sizes[] = {50, 50}; coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + 
TensorDims{FFOrdered{50_n}}, + DataType::FLOAT, + }; SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -47,8 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..3c15661396 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,58 +1,54 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/transpose_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { TransposeAttrs attrs = TransposeAttrs{ - FFOrdered{ - ff_dim_t{0_n}, + FFOrdered{ ff_dim_t{1_n}, + ff_dim_t{0_n}, }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = 
allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), attrs, input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc deleted file mode 100644 index 903b666fa9..0000000000 --- a/lib/kernels/test/src/test_utils.cc +++ /dev/null @@ -1,106 +0,0 @@ -#include "test_utils.h" - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * 
sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } -} - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; -} - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; 
-} diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h deleted file mode 100644 index 08f0f382fb..0000000000 --- a/lib/kernels/test/src/test_utils.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef _FLEXFLOW_KERNELS_TEST_UTILS -#define _FLEXFLOW_KERNELS_TEST_UTILS - -#include "kernels/device.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include -#include -#include -#include -#include - -using namespace FlexFlow; - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill = false); - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims); - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims); - -template -std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { - int volume = accessor.shape.get_volume(); - - std::vector local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(T), - cudaMemcpyDeviceToHost)); - return local_data; -} - -template -bool contains_non_zero(std::vector &data) { - return !all_of( - data.begin(), data.end(), [](T const &val) { return val == 0; }); -} - -// Specialize doctest's StringMaker for std::vector -template <> -struct doctest::StringMaker> { - static doctest::String convert(std::vector const &vec) { - std::ostringstream oss; - for (size_t i = 0; i < vec.size(); ++i) { - oss << vec[i]; - if (i != vec.size() - 1) { - oss << ", "; - } - } - return doctest::String(("[" + oss.str() + 
"]").c_str()); - } -}; - -#endif diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_task_argument_accessor.cc 
b/lib/local-execution/src/local_task_argument_accessor.cc index 54eca7e514..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 1df6da8d8e..5cf8742918 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -134,9 +134,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + 
output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index bb1504a3f5..55ff354483 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -107,8 +107,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -120,10 +120,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index c5ff9199f3..311b8e7924 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -58,8 +58,10 @@ static DeviceSpecificDeviceStates ParallelTensorShape output_shape = 
throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = init_kernel( - get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); + ElementUnaryPerDeviceState per_device_state = + init_kernel(array_shape_from_tensor_shape(get_piece_shape(input_shape)), + array_shape_from_tensor_shape(get_piece_shape(output_shape)), + attrs); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -88,10 +90,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -106,10 +108,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 0f872b5d50..af6fc16272 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -40,15 +40,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc 
b/lib/local-execution/src/ops/linear.cc index 6f0901e66a..9641cdbd4a 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -26,9 +26,9 @@ OpTaskInvocation init(LinearAttrs const &attrs) { binding.bind_arg(HANDLE, ff_handle()); binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {task_id_t::LINEAR_INIT_TASK_ID, binding}; } @@ -36,11 +36,11 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); if (attrs.use_bias) { - binding.bind(BIAS, weight_tensor(1)); // bias + binding.bind(BIAS, weight_tensor(1)); } binding.bind_arg(PROFILING, profiling_settings()); @@ -124,20 +124,21 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto output = acc.get_tensor(OUTPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_grad_ptr = NULL; if (attrs.use_bias) { - bias_ptr = bias.get_float_ptr(); + auto bias_grad = 
acc.get_tensor_grad(BIAS); + bias_grad_ptr = bias_grad.get_float_ptr(); } nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); @@ -148,13 +149,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + bias_grad_ptr, in_dim.unwrap_nonnegative(), out_dim.unwrap_nonnegative(), batch_size.unwrap_nonnegative()); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index fb0635efba..f85874dc0a 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -115,19 +115,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index ee1a7c6c4e..b07d9fe965 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -63,13 +63,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { 
ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 6c0c813c8d..7b6e9fe2f6 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -85,8 +85,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index d3ada35d93..99aeb913ba 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -66,8 +66,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree.unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index fc3a75607d..e382b2668e 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -86,8 +86,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff --git a/lib/local-execution/src/ops/reverse.cc 
b/lib/local-execution/src/ops/reverse.cc index ddd47d355d..00f56c6892 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -48,30 +48,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - nonnegative_int output_size = output.shape.get_volume(); - auto axis = attrs.axis; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) { - if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t{i}); - } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= output.shape.at(ff_dim_t{i}); - } - } - return profile(forward_kernel, profiling, "[reverse] forward_time = {:.2lf}ms\n", - input.get_float_ptr(), - output.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - output_size.unwrap_nonnegative()); + input, + output, + attrs); } static std::optional @@ -81,30 +63,12 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input_grad.shape.num_dims().unwrap_nonnegative() - - attrs.axis.value.unwrap_nonnegative() - 1; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { - if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{i}); - } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= input_grad.shape.at(ff_dim_t{i}); - } - } - return profile(backward_kernel, profiling, "[reverse] backward_time = {:.2lf}ms\n", - output_grad.get_float_ptr(), - input_grad.get_float_ptr(), - 
num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad.shape.get_volume().unwrap_nonnegative()); + output_grad, + input_grad, + attrs); } TaskImplFunction get_reverse_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 0e94422c5f..e008098e05 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -106,8 +106,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume().unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 4146836b9a..1859bb0ccc 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -67,8 +67,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", attrs, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return 
Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..9f8b4092c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,68 +12,71 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle{ +// /*workSpaceSize=*/1024 * 1024, +// /*allowTensorOpMathConversion=*/true}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; +// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ +// DeviceSpecific::create(managed_handle.raw_handle()), +// EnableProfiling::YES, +// ProfilingSettings{/*warmup_iters=*/0, +// /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; +// LocalCostEstimator cost_estimator = +// LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; +// SUBCASE("Estimate cost -- Attention Op") { +// int embed_dim = 32; +// int num_heads = 10; +// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ +// /*embed_dim=*/embed_dim, +// /*num_heads=*/num_heads, +// /*kdim=*/embed_dim, +// /*vdim=*/embed_dim, +// /*dropout=*/0.0, +// /*bias=*/true, +// 
/*add_bias_kv=*/false, +// /*add_zero_attn=*/false, +// }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; +// size_t batch_size = 40; +// size_t seq_len = 48; +// size_t feature_size = 36; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); +// DataType dtype = DataType::FLOAT; +// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ +// TensorDims{FFOrdered{batch_size, seq_len, +// feature_size}}, DataType::FLOAT, +// }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape weights_shape = throw_if_unexpected( +// get_weights_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs weight_attrs = +// ParallelTensorAttrs{weights_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// ParallelTensorShape output_shape = throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape output_shape = throw_if_unexpected( +// get_output_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs output_attrs = +// ParallelTensorAttrs{output_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// CostDetails result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// 
std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); +// CostDetails result = cost_estimator.estimate_cost( +// PCGOperatorAttrs{attrs}, +// std::vector{ +// inputs_shape, inputs_shape, inputs_shape}, +// std::vector{weight_attrs}, +// std::vector{output_attrs}, +// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); +// CHECK(result.total_elapsed_time > 0); +// CHECK(result.total_mem_usage > 0); +// } +// } // } -// } -// } diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index dffb19398c..e55d1eddf5 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -1,6 +1,6 @@ #include "kernels/attention_kernels.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/local_slots_backing.h" #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" @@ -106,24 +106,24 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( query_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{query_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(query_shape), dtype}; CHECK(result == correct); } SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( key_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{key_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(key_shape), dtype}; CHECK(result == correct); } SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( value_guid, local_slots_backing.gradient_tensor_mapping); - std::pair 
correct = {ArrayShape{value_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(value_shape), dtype}; CHECK(result == correct); } } @@ -135,9 +135,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } @@ -146,9 +146,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 0fab0f6a60..a39bb229e2 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,5 +1,5 @@ #include "doctest/doctest.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/task_signature_impl.h" #include "utils/fmt/variant.h" diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..09ee99915d 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,6 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ 
b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index f2355289dc..5c47745209 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -17,13 +17,9 @@ struct DimOrdered { DimOrdered(std::initializer_list const &l) : contents(l.begin(), l.end()) {} - /* template ::value>::type> */ DimOrdered(std::vector const &contents) : contents(contents.begin(), contents.end()) {} - /* template ::value>::type> */ template DimOrdered(It begin, It end) : contents(begin, end) {} @@ -62,10 +58,6 @@ struct DimOrdered { return this->contents != other.contents; } - bool operator<(DimOrdered const &other) const { - return this->contents < other.contents; - } - using iterator = typename stack_vector::iterator; using const_iterator = typename stack_vector::const_iterator; @@ -116,7 +108,7 @@ struct DimOrdered { } reverse_iterator rend() { - return this->contents.crend(); + return this->contents.rend(); } const_reverse_iterator rend() const { @@ -145,195 +137,26 @@ struct DimOrdered { stack_vector contents; }; -template -struct DimOrdered { - DimOrdered() {} - - DimOrdered(std::initializer_list const &l) - : contents(l.begin(), l.end()) {} - - DimOrdered(std::vector const &contents) - : 
contents(contents.begin(), contents.end()) {} - - template - DimOrdered(It begin, It end) : contents(begin, end) {} - - template - DimOrdered(stack_vector const &contents) - : contents(contents.begin(), contents.end()) {} - - T const &at(ff_dim_t idx) const { - int raw = idx.value.unwrap_nonnegative(); - return this->contents.at(raw); - } - - T const &at(relative_ff_dim_t idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); - } - - T &at(ff_dim_t idx) { - int raw = idx.value.unwrap_nonnegative(); - return this->contents.at(raw); - } - - T &at(relative_ff_dim_t idx) { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); - } - - T const &operator[](ff_dim_t idx) const { - return this->at(idx); - } - - T const &operator[](relative_ff_dim_t idx) const { - return this->at(idx); - } - - T &operator[](ff_dim_t idx) { - return this->at(idx); - } - - T &operator[](relative_ff_dim_t idx) { - return this->at(idx); - } - - bool idx_is_valid(ff_dim_t const &idx) const { - int raw = idx.value.unwrap_nonnegative(); - return raw < this->contents.size(); - } - - bool idx_is_valid(relative_ff_dim_t const &idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return (raw >= 0 && raw < this->contents.size()); - } - - bool operator==(DimOrdered const &other) const { - return this->contents == other.contents; - } - - bool operator!=(DimOrdered const &other) const { - return this->contents != other.contents; - } - - bool operator<(DimOrdered const &other) const { - return this->contents < other.contents; - } - - using iterator = typename stack_vector::iterator; - using const_iterator = - typename stack_vector::const_iterator; - using reverse_iterator = - typename stack_vector::reverse_iterator; - using const_reverse_iterator = - typename stack_vector::const_reverse_iterator; - using value_type = T; - using pointer 
= value_type *; - using const_pointer = value_type const *; - using reference = value_type &; - using const_reference = value_type const &; - - iterator begin() { - return this->contents.begin(); - } - - const_iterator begin() const { - return this->cbegin(); - } - - const_iterator cbegin() const { - return this->contents.cbegin(); - } - - iterator end() { - return this->contents.end(); - } - - const_iterator end() const { - return this->cend(); - } - - const_iterator cend() const { - return this->contents.cend(); - } - - reverse_iterator rbegin() { - return this->contents.rbegin(); - } - - const_reverse_iterator rbegin() const { - return this->crbegin(); - } - - const_reverse_iterator crbegin() const { - return this->contents.crbegin(); - } - - reverse_iterator rend() { - return this->contents.crend(); - } - - const_reverse_iterator rend() const { - return this->crend(); - } - - const_reverse_iterator crend() const { - return this->contents.crend(); - } - - size_t size() const { - return this->contents.size(); - } - - size_t empty() const { - return this->contents.empty(); - } - - size_t num_dims() const { - return this->size(); - } - - friend struct ::std::hash; - -private: - stack_vector contents; -}; - -template -using FFOrdered = DimOrdered; +template +auto operator<(DimOrdered const &lhs, DimOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} -template -std::string format_as(FFOrdered const &v) { +template +std::string format_as(DimOrdered const &v) { std::vector as_vec(v.cbegin(), v.cend()); return fmt::format("", as_vec); } -template -std::ostream &operator<<(std::ostream &s, FFOrdered const &v) { +template +std::ostream &operator<<(std::ostream &s, DimOrdered const &v) { return (s << fmt::to_string(v)); } } // namespace FlexFlow -/* template */ -/* void to_json(json &j, DimOrdered const &x) { */ -/* /1* j = std::vector{x.cbegin(), x.cend()}; *1/ */ -/* } */ - 
-/* template */ -/* void from_json(json const &j, DimOrdered &x) { */ -/* /1* x = DimOrdered{j.template get>()}; *1/ */ -/* } */ - namespace nlohmann { template struct adl_serializer<::FlexFlow::DimOrdered> { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h index 166916dd44..76526447be 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_SLICE_H #include "op-attrs/dim_ordered/dim_ordered.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/transform.h" #include "utils/containers/vector_of.h" #include "utils/optional.h" @@ -18,35 +18,8 @@ DimOrdered nonoverloaded_slice(DimOrdered const &d, }; return DimOrdered{ - subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; + slice(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; } - -template -FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end) { - auto to_raw_idx = - [](std::optional const &idx) -> std::optional { - return transform( - idx, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); - }; - - return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; -} - -template -FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end) { - auto to_raw_idx = - [](std::optional const &idx) -> std::optional { - return transform(idx, [](relative_ff_dim_t const &i) { return i.value; }); - }; - - return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; -} - template DimOrdered slice(DimOrdered const &d, std::optional const &start = std::nullopt, @@ -54,20 +27,6 @@ DimOrdered slice(DimOrdered const &d, return ff_dim_t_nonoverloaded_slice(d, start, end); } -template -FFOrdered 
slice(FFOrdered const &d, - std::optional const &start = std::nullopt, - std::optional const &end = std::nullopt) { - return ff_dim_t_nonoverloaded_slice(d, start, end); -} - -template -FFOrdered slice(FFOrdered const &d, - std::optional const &start = std::nullopt, - std::optional const &end = std::nullopt) { - return relative_ff_dim_t_nonoverloaded_slice(d, start, end); -} - } // namespace FlexFlow #endif diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h similarity index 95% rename from lib/op-attrs/include/op-attrs/dim_ordered/concat.h rename to lib/op-attrs/include/op-attrs/ff_ordered/concat.h index 9b9eaf9b93..a5faed2b36 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/concat_vectors.h" #include "utils/containers/transform.h" diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h similarity index 95% rename from lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h rename to lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h index 9e4271a1ff..bc8636615c 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/bidict/bidict.h" #include "utils/containers/count.h" diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h 
b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h new file mode 100644 index 0000000000..92ed211c31 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h @@ -0,0 +1,228 @@ +#ifndef _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H +#define _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H + +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/relative_ff_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct FFOrdered { + FFOrdered() {} + + FFOrdered(std::initializer_list const &l) : contents(l.begin(), l.end()) {} + + FFOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + FFOrdered(It begin, It end) : contents(begin, end) {} + + template + FFOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(ff_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &at(relative_ff_dim_t idx) const { + int raw = idx.value; + if (raw < 0) { + raw = this->contents.size() + raw; + } + return this->contents.at(raw); + } + + T &at(ff_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(relative_ff_dim_t idx) { + int raw = idx.value; + if (raw < 0) { + raw = this->contents.size() + raw; + } + return this->contents.at(raw); + } + + T const &operator[](ff_dim_t idx) const { + return this->at(idx); + } + + T const &operator[](relative_ff_dim_t idx) const { + return this->at(idx); + } + + T &operator[](ff_dim_t idx) { + return this->at(idx); + } + + T &operator[](relative_ff_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(ff_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool idx_is_valid(relative_ff_dim_t const &idx) const { + int raw = idx.value; + if (raw < 0) { + raw = 
this->contents.size() + raw; + } + return (raw >= 0 && raw < this->contents.size()); + } + + bool operator==(FFOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(FFOrdered const &other) const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(FFOrdered const &lhs, FFOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), 
rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(FFOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, FFOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::FFOrdered> { + static ::FlexFlow::FFOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, ::FlexFlow::FFOrdered const &x) { + j = std::vector{x.cbegin(), x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::FFOrdered> { + size_t operator()(::FlexFlow::FFOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::FFOrdered> { + static Gen<::FlexFlow::FFOrdered> arbitrary() { + return gen::construct<::FlexFlow::FFOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h similarity index 88% rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h index f8f49233ec..9232afddfb 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H -#include "op-attrs/dim_ordered/dim_ordered.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" #include "op-attrs/ff_dim_t.h" +#include 
"op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h similarity index 88% rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h index 8cc1bf3a51..ace60b7e3d 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h similarity index 91% rename from lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h rename to lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h index 4e7f8530a4..5ff390d3fe 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H -#include "op-attrs/dim_ordered/dim_ordered.h" #include "op-attrs/ff_dim_t.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/count.h" #include "utils/containers/transform.h" diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h new file mode 100644 index 0000000000..79217c4cc3 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H 
+ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template +FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, + ff_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + return FFOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +template +FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &d, + relative_ff_dim_t const &start, + std::optional const &end) { + int raw_start = start.value; + std::optional raw_end = + transform(end, [](relative_ff_dim_t const &i) { return i.value; }); + + return FFOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +template +FFOrdered slice(FFOrdered const &d, + ff_dim_t const &start = ff_dim_t{0_n}, + std::optional const &end = std::nullopt) { + return ff_dim_t_nonoverloaded_slice(d, start, end); +} + +template +FFOrdered slice(FFOrdered const &d, + relative_ff_dim_t const &start = relative_ff_dim_t{0}, + std::optional const &end = std::nullopt) { + return relative_ff_dim_t_nonoverloaded_slice(d, start, end); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h new file mode 100644 index 0000000000..3a8eeb9ecf --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +FFOrdered transform(FFOrdered const &d, F &&f) { + return 
FFOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h new file mode 100644 index 0000000000..fe207740f7 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/zip.h" + +namespace FlexFlow { + +template +FFOrdered> zip(FFOrdered const &lhs, + FFOrdered const &rhs) { + return FFOrdered>{zip(vector_of(lhs), vector_of(rhs))}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml index b1c5f60382..50756f095b 100644 --- a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml index be3a95eec8..d68ef02ec1 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/parallel_tensor_shape/sum_degree.dtg.h", "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h", - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "utils/nonnegative_int/nonnegative_int.h", ] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml 
b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml index f24fa12309..d2f8758377 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "op-attrs/shard_parallel_dim.dtg.h", "op-attrs/replica_parallel_dim_set.dtg.h", "", diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index 97f3432c2f..ba35295e09 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -19,7 +19,7 @@ std::optional get_broadcast_target_dims(std::unordered_set const &); TensorDims slice_tensor_dims(TensorDims const &, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml index e86b866fd6..8c6d1098cc 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "utils/nonnegative_int/nonnegative_int.h", ] diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index a3cd8bfd9a..298ea04638 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -12,7 +12,7 @@ nonnegative_int get_num_elements(TensorShape const &); nonnegative_int get_size_in_bytes(TensorShape const &); TensorShape slice_tensor_shape(TensorShape const &, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc 
b/lib/op-attrs/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..4604ef0b4e --- /dev/null +++ b/lib/op-attrs/src/op-attrs/datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/datatype_value.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value) { + return DataTypeValue{value}; +} + +DataTypeValue make_double_data_type_value(double value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int32_data_type_value(int32_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int64_data_type_value(int64_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_bool_data_type_value(bool value) { + return DataTypeValue{value}; +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc deleted file mode 100644 index cb29f708a3..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/concat.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc deleted file mode 100644 index 6edd5485af..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/enumerate.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc deleted file mode 100644 index 2de88f38c8..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc deleted file mode 100644 index 8e5c2fd38a..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_of.h" diff --git 
a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc deleted file mode 100644 index 175ae8d4bd..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/get_idxs.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc index 75ab1a32aa..8c3dbd7bbc 100644 --- a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc @@ -1,26 +1 @@ #include "op-attrs/dim_ordered/slice.h" -#include "utils/archetypes/value_type.h" - -namespace FlexFlow { - -using T = value_type<0>; - -template FFOrdered - ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc new file mode 100644 index 0000000000..73683eba94 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc @@ -0,0 +1 @@ +#include "op-attrs/dim_ordered/transform.h" diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc new file mode 100644 index 0000000000..e06c144149 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/enumerate.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate(FFOrdered const &); + +} // namespace FlexFlow diff --git 
a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..1420586809 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,14 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct FFOrdered; + +template std::string format_as(FFOrdered const &); + +template std::ostream &operator<<(std::ostream &, FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc new file mode 100644 index 0000000000..e39fedb858 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -0,0 +1,13 @@ +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered ff_ordered_from_map(std::map const &); + +template FFOrdered + ff_ordered_from_map(std::unordered_map const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc new file mode 100644 index 0000000000..3da15bebba --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/get_idxs.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::vector get_idxs(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc new file mode 100644 index 0000000000..059fd811cd --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc @@ -0,0 +1,24 @@ +#include "op-attrs/ff_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + 
+template FFOrdered ff_dim_t_nonoverloaded_slice( + FFOrdered const &, ff_dim_t const &, std::optional const &); + +template FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..74bf4895a3 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template FFOrdered transform(FFOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..dc715ea97c --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T1 = value_type<0>; +using T2 = value_type<1>; + +template FFOrdered> zip(FFOrdered const &, + FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index d4763ef004..ddd92bd417 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/batch_norm.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include 
"op-attrs/tensor_shape.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index fc42241ef2..bf0ba553e4 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/concat.h" -#include "op-attrs/dim_ordered/enumerate.h" -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/enumerate.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 4dc602646b..5b5b91a8e7 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -1,8 +1,10 @@ #include "op-attrs/ops/embedding.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.h" #include "utils/containers/product.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index 8ed12167b3..b4eeda76ab 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/flat.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 00c6bb5e9b..c9798368e2 
100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/layer_norm.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" -#include "op-attrs/dim_ordered/get_idxs.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/all_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index fb26113613..bee9d0cf4f 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -1,11 +1,12 @@ #include "op-attrs/ops/linear.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" #include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/product.h" #include "utils/expected.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { @@ -101,7 +102,7 @@ tl::expected SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), - std::nullopt, + relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, @@ -126,8 +127,10 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice( - ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; + DiscardCopyDegree discard_copy_degree = + DiscardCopyDegree{product(slice(ff_ordered_shard_degrees(input), + relative_ff_dim_t{0}, + 
relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{get_discard_copy_degree(input)}; diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 7a8f91e498..3f2245b2dc 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -1,6 +1,6 @@ #include "op-attrs/parallel_tensor_dims.h" -#include "op-attrs/dim_ordered/transform.h" -#include "op-attrs/dim_ordered/zip.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ff_ordered/zip.h" #include "op-attrs/replica_parallel_dim.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.h" diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index 8d0592eab7..760278297c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -1,6 +1,6 @@ #include "op-attrs/tensor_dims.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/zip.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/zip.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.dtg.h" #include "utils/containers/all_of.h" @@ -67,7 +67,7 @@ std::optional } TensorDims slice_tensor_dims(TensorDims const &dims, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop) { return TensorDims{ slice(dims.ff_ordered, start, stop), diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 04b18794f1..afc14af54c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -29,7 +29,7 @@ nonnegative_int get_size_in_bytes(TensorShape const &s) { } TensorShape slice_tensor_shape(TensorShape const &shape, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop) { return TensorShape{ 
slice_tensor_dims(shape.dims, start, stop), diff --git a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..9b0e90b601 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc @@ -0,0 +1,68 @@ +#include "op-attrs/datatype_value.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("test make_data_type_value") { + SUBCASE("make_float_data_type_value") { + float value = 1.0f; + DataTypeValue data_type_value = make_float_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_double_data_type_value") { + double value = 2.71828; + DataTypeValue data_type_value = make_double_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_int32_data_type_value") { + int32_t value = -42; + DataTypeValue data_type_value = make_int32_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_int64_data_type_value") { + int64_t value = 1LL << 40; + DataTypeValue data_type_value = make_int64_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_bool_data_type_value") { + bool value = true; + 
DataTypeValue data_type_value = make_bool_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + } +} diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc index d7901a0c53..a5a261da25 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc +++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc @@ -10,8 +10,4 @@ TEST_SUITE(FF_TEST_SUITE) { "Arbitrary> with T=", T, int, double, char) { RC_SUBCASE([](DimOrdered) {}); } - - TEST_CASE_TEMPLATE("Arbitrary> with T=", T, int, double, char) { - RC_SUBCASE([](FFOrdered) {}); - } } diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc similarity index 97% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc index 2ac641cfc2..d8e04124bc 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/concat.h" +#include "op-attrs/ff_ordered/concat.h" #include using namespace ::FlexFlow; diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc similarity index 92% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc index bf4c33d65a..e1a94e72c3 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/enumerate.h" +#include "op-attrs/ff_ordered/enumerate.h" #include "test/utils/doctest/fmt/map.h" #include diff --git 
a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..b0812ba9d6 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,11 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "test/utils/rapidcheck.h" +#include + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE_TEMPLATE("Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](FFOrdered) {}); + } +} diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc similarity index 96% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc index bba989920e..73036d5662 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include using namespace ::FlexFlow; diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc similarity index 79% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc index b2fddd058e..2f1dfecd65 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/slice.h" #include using namespace ::FlexFlow; @@ -25,13 +25,6 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("std::nullopt_t, ff_dim_t") { - FFOrdered result = - slice(d, std::nullopt, ff_dim_t{nonnegative_int{3}}); - FFOrdered correct = FFOrdered{1, 2, 3}; - - CHECK(result == correct); - } 
SUBCASE("relative_ff_dim_t, relative_ff_dim_t") { FFOrdered result = slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{-1}); @@ -45,12 +38,6 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("std::nullopt_t, relative_ff_dim_t") { - FFOrdered result = slice(d, std::nullopt, relative_ff_dim_t{-1}); - FFOrdered correct = FFOrdered{1, 2, 3}; - - CHECK(result == correct); - } SUBCASE("start index = stop index") { FFOrdered result = slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{1}); @@ -86,10 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK_THROWS(slice(d, relative_ff_dim_t{10}, std::nullopt)); } SUBCASE("stop index out of bounds (too low)") { - CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{-10})); + CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{-10})); } SUBCASE("stop index out of bounds (too high)") { - CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{10})); + CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{10})); } } } diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..4bf189ec77 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,35 @@ +#include "op-attrs/ff_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("transform(FFOrdered, F)") { + SUBCASE("input is empty") { + FFOrdered input = {}; + + FFOrdered result = transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + FFOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + FFOrdered input = {2, 1, 2, 5}; + + FFOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + FFOrdered correct = FFOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc 
b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..19167cd0ff --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,38 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "test/utils/doctest/fmt/pair.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("zip(FFOrdered, FFOrdered)") { + FFOrdered lhs_input = {9, 9, 8, 9}; + FFOrdered rhs_input = {"m", "m", "k", "l", "m"}; + + SUBCASE("lhs is longer") { + FFOrdered> result = zip(lhs_input, rhs_input); + + FFOrdered> correct = { + {9, "m"}, + {9, "m"}, + {8, "k"}, + {9, "l"}, + }; + + CHECK(result == correct); + } + + SUBCASE("rhs is longer") { + FFOrdered> result = zip(rhs_input, lhs_input); + + FFOrdered> correct = { + {"m", 9}, + {"m", 9}, + {"k", 8}, + {"l", 9}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/pcg/include/pcg/metric.enum.toml b/lib/pcg/include/pcg/metric.enum.toml new file mode 100644 index 0000000000..ebb2323203 --- /dev/null +++ b/lib/pcg/include/pcg/metric.enum.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "Metric" +features = [ + "hash", + "json", + "rapidcheck", + "fmt", +] + +[[values]] +name = "ACCURACY" + +[[values]] +name = "CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "SPARSE_CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "MEAN_SQUARED_ERROR" + +[[values]] +name = "ROOT_MEAN_SQUARED_ERROR" + +[[values]] +name = "MEAN_ABSOLUTE_ERROR" diff --git a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h new file mode 100644 index 0000000000..343c2154dd --- /dev/null +++ b/lib/pcg/include/pcg/metric_attrs.h @@ -0,0 +1,28 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include "op-attrs/ops/loss_functions/loss_functions.h" +#include "pcg/metric.dtg.h" +#include "utils/fmt.h" +#include + +namespace FlexFlow { + +class MetricsAttrs { +public: + MetricsAttrs() = delete; + MetricsAttrs(LossFunction, std::unordered_set const &); + +public: + LossFunction 
loss_type; + bool measure_accuracy; + bool measure_categorical_crossentropy; + bool measure_sparse_categorical_crossentropy; + bool measure_mean_squared_error; + bool measure_root_mean_squared_error; + bool measure_mean_absolute_error; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/src/pcg/metric_attrs.cc b/lib/pcg/src/pcg/metric_attrs.cc new file mode 100644 index 0000000000..9a93e75350 --- /dev/null +++ b/lib/pcg/src/pcg/metric_attrs.cc @@ -0,0 +1,38 @@ +#include "pcg/metric_attrs.h" + +namespace FlexFlow { +MetricsAttrs::MetricsAttrs(LossFunction _loss_type, + std::unordered_set const &metrics) + : loss_type(_loss_type), measure_accuracy(false), + measure_categorical_crossentropy(false), + measure_sparse_categorical_crossentropy(false), + measure_mean_squared_error(false), measure_root_mean_squared_error(false), + measure_mean_absolute_error(false) { + for (Metric const &m : metrics) { + switch (m) { + case Metric::ACCURACY: + measure_accuracy = true; + continue; + case Metric::CATEGORICAL_CROSSENTROPY: + measure_categorical_crossentropy = true; + continue; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + measure_sparse_categorical_crossentropy = true; + continue; + case Metric::MEAN_SQUARED_ERROR: + measure_mean_squared_error = true; + continue; + case Metric::ROOT_MEAN_SQUARED_ERROR: + measure_root_mean_squared_error = true; + continue; + case Metric::MEAN_ABSOLUTE_ERROR: + measure_mean_absolute_error = true; + continue; + default: + throw mk_runtime_error(fmt::format( + "Initializing MetricsAttrs with unrecognized metrics type {}", m)); + } + } +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc index 2cf149f78a..940024c9b6 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc @@ -1,5 +1,5 @@ #include 
"pcg/parallel_computation_graph/generate_weight_transform.h" -#include "op-attrs/dim_ordered/enumerate.h" +#include "op-attrs/ff_ordered/enumerate.h" #include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc index feb6e704b2..33e15baed2 100644 --- a/lib/runtime/src/metrics_functions.cc +++ b/lib/runtime/src/metrics_functions.cc @@ -25,39 +25,6 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_metrics("metrics"); -MetricsAttrs::MetricsAttrs(LossFunction _loss_type, - std::vector const &metrics) - : loss_type(_loss_type), measure_accuracy(false), - measure_categorical_crossentropy(false), - measure_sparse_categorical_crossentropy(false), - measure_mean_squared_error(false), measure_root_mean_squared_error(false), - measure_mean_absolute_error(false) { - for (Metric const &m : metrics) { - switch (m) { - case Metric::ACCURACY: - measure_accuracy = true; - continue; - case Metric::CATEGORICAL_CROSSENTROPY: - measure_categorical_crossentropy = true; - continue; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - measure_sparse_categorical_crossentropy = true; - continue; - case Metric::MEAN_SQUARED_ERROR: - measure_mean_squared_error = true; - continue; - case Metric::ROOT_MEAN_SQUARED_ERROR: - measure_root_mean_squared_error = true; - continue; - case Metric::MEAN_ABSOLUTE_ERROR: - measure_mean_absolute_error = true; - continue; - default: - throw mk_runtime_error("Unrecogonized metrics type {}", m); - } - } -} - enum Slots { LOGIT, LABEL, diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h index fbb0b633bf..73dc3bbc51 100644 --- a/lib/runtime/src/metrics_functions.h +++ b/lib/runtime/src/metrics_functions.h @@ -16,38 +16,13 @@ #ifndef _FF_METRICS_FUNCTIONS_H_ #define _FF_METRICS_FUNCTIONS_H_ +#include "kernels/metric.h" #include "kernels/perf_metrics.h" #include "legion.h" -#include "op-attrs/ops/loss_functions.h" #include 
"task_spec/task_invocation.h" -#include "utils/fmt.h" namespace FlexFlow { -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - TypedIndexTaskInvocation compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 253fd3cb4f..83e7c15460 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,11 +77,11 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, 
output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), diff --git a/lib/utils/include/utils/containers/subvec.h b/lib/utils/include/utils/containers/slice.h similarity index 69% rename from lib/utils/include/utils/containers/subvec.h rename to lib/utils/include/utils/containers/slice.h index c89e9227de..a82fb383b5 100644 --- a/lib/utils/include/utils/containers/subvec.h +++ b/lib/utils/include/utils/containers/slice.h @@ -9,9 +9,9 @@ namespace FlexFlow { template -std::vector subvec(std::vector const &v, - std::optional const &maybe_start, - std::optional const &maybe_end) { +std::vector slice(std::vector const &v, + int const &maybe_start, + std::optional const &maybe_end) { auto begin_iter = v.cbegin(); auto end_iter = v.cend(); @@ -22,15 +22,13 @@ std::vector subvec(std::vector const &v, if (idx < 0) { new_idx = size + idx; } - if (new_idx < 0 || new_idx > size) { - throw mk_runtime_error("Index {} is out of bounds for array {}"); - } + + ASSERT(new_idx >= 0, "Index out of bounds"); + ASSERT(new_idx <= size, "Index out of bounds"); return new_idx; }; - if (maybe_start.has_value()) { - begin_iter += resolve_loc(maybe_start.value()); - } + begin_iter += resolve_loc(maybe_start); if (maybe_end.has_value()) { end_iter = v.cbegin() + resolve_loc(maybe_end.value()); diff --git a/lib/utils/include/utils/containers/zip_strict.h b/lib/utils/include/utils/containers/zip_strict.h index 64049042d4..5606fccff1 100644 --- a/lib/utils/include/utils/containers/zip_strict.h +++ b/lib/utils/include/utils/containers/zip_strict.h @@ -4,21 +4,17 @@ #include "utils/containers/zip.h" #include "utils/exception.h" #include "utils/fmt/vector.h" +#include namespace FlexFlow { template std::vector> zip_strict(std::vector const &lhs, std::vector const &rhs) { - if (lhs.size() != rhs.size()) { - throw mk_runtime_error( - fmt::format("zip_strict requires lhs and rhs to have the same length, " - "but 
received lhs={} (length {}), rhs={} (length {})", - lhs, - lhs.size(), - rhs, - rhs.size())); - } + ASSERT(lhs.size() == rhs.size(), + "zip_strict requires lhs and rhs to have the same length", + lhs, + rhs); return zip(lhs, rhs); } diff --git a/lib/utils/include/utils/exception.h b/lib/utils/include/utils/exception.h index 080cbb3611..f95eb8a38d 100644 --- a/lib/utils/include/utils/exception.h +++ b/lib/utils/include/utils/exception.h @@ -3,6 +3,7 @@ #include "utils/fmt.h" #include +#include #include #include diff --git a/lib/utils/include/utils/indent.h b/lib/utils/include/utils/indent.h new file mode 100644 index 0000000000..eccbd34cfc --- /dev/null +++ b/lib/utils/include/utils/indent.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H + +#include + +namespace FlexFlow { + +std::string indent(std::string const &, int indent_size = 2); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/stack_vector/stack_vector.h b/lib/utils/include/utils/stack_vector/stack_vector.h index 5d4d6eaad3..64d005a10e 100644 --- a/lib/utils/include/utils/stack_vector/stack_vector.h +++ b/lib/utils/include/utils/stack_vector/stack_vector.h @@ -272,18 +272,6 @@ struct stack_vector { return !(*this == other); } - bool operator<(stack_vector const &other) const { - for (std::size_t i = 0; i < std::min(this->m_size, other.m_size); i++) { - if (this->at(i) < other.at(i)) { - return true; - } else if (this->at(i) > other.at(i)) { - return false; - } - } - - return (this->m_size < other.m_size); - } - std::size_t size() const { return this->m_size; } @@ -305,17 +293,16 @@ struct stack_vector { private: std::size_t m_size = 0; std::array contents; - - static_assert( - implies, is_equal_comparable>::value, - ""); - static_assert( - implies, is_neq_comparable>::value, - ""); - static_assert( - implies, is_lt_comparable>::value, ""); }; +template +auto operator<(stack_vector const &lhs, + stack_vector 
const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + template std::ostream &operator<<(std::ostream &s, stack_vector const &v) { return s << fmt::to_string(v); diff --git a/lib/utils/src/utils/containers/slice.cc b/lib/utils/src/utils/containers/slice.cc new file mode 100644 index 0000000000..f960c21881 --- /dev/null +++ b/lib/utils/src/utils/containers/slice.cc @@ -0,0 +1,3 @@ +#include "utils/containers/slice.h" + +namespace FlexFlow {} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/subvec.cc b/lib/utils/src/utils/containers/subvec.cc deleted file mode 100644 index 93c7de31c5..0000000000 --- a/lib/utils/src/utils/containers/subvec.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/containers/subvec.h" diff --git a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc index 8445a2721a..8aed06ae01 100644 --- a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc +++ b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc @@ -1,5 +1,5 @@ #include "utils/full_binary_tree/binary_tree_path.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" namespace FlexFlow { @@ -27,7 +27,7 @@ BinaryTreePathEntry binary_tree_path_get_top_level(BinaryTreePath const &p) { BinaryTreePath binary_tree_path_get_non_top_level(BinaryTreePath const &p) { return BinaryTreePath{ - subvec(p.entries, 1, std::nullopt), + slice(p.entries, 1, std::nullopt), }; } diff --git a/lib/utils/src/utils/graph/series_parallel/series_reduction.cc b/lib/utils/src/utils/graph/series_parallel/series_reduction.cc index 5b9b592444..459e61be71 100644 --- a/lib/utils/src/utils/graph/series_parallel/series_reduction.cc +++ b/lib/utils/src/utils/graph/series_parallel/series_reduction.cc @@ -3,7 +3,7 @@ #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" #include "utils/containers/require_same.h" 
-#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/values.h" #include "utils/graph/digraph/algorithms/get_predecessors.h" @@ -103,7 +103,7 @@ MultiDiEdge Node last = g.get_multidiedge_dst(reduction.edges.back()); std::vector internal_nodes; - for (MultiDiEdge const &e : subvec(reduction.edges, std::nullopt, -1)) { + for (MultiDiEdge const &e : slice(reduction.edges, 0, -1)) { internal_nodes.push_back(g.get_multidiedge_dst(e)); } diff --git a/lib/utils/src/utils/indent.cc b/lib/utils/src/utils/indent.cc new file mode 100644 index 0000000000..2761ad1878 --- /dev/null +++ b/lib/utils/src/utils/indent.cc @@ -0,0 +1,17 @@ +#include "utils/indent.h" +#include "utils/containers/flatmap.h" + +namespace FlexFlow { + +std::string indent(std::string const &s, int indent_size) { + std::string indent_str(indent_size, ' '); + return indent_str + flatmap(s, [&](char c) -> std::string { + if (c == '\n') { + return "\n" + indent_str; + } else { + return std::string{c}; + }; + }); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/stack_vector/stack_vector.cc b/lib/utils/src/utils/stack_vector/stack_vector.cc index d4fb849412..e2009d74d3 100644 --- a/lib/utils/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/src/utils/stack_vector/stack_vector.cc @@ -1,9 +1,9 @@ #include "utils/stack_vector/stack_vector.h" -#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { -using T = ordered_value_type<0>; +using T = value_type<0>; template struct stack_vector; template struct stack_vector; diff --git a/lib/utils/test/common/include/test/utils/doctest/check_kv.h b/lib/utils/test/common/include/test/utils/doctest/check_kv.h new file mode 100644 index 0000000000..6449b8ac87 --- /dev/null +++ b/lib/utils/test/common/include/test/utils/doctest/check_kv.h @@ -0,0 +1,12 @@ +#ifndef 
_FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H +#define _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H + +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/test/common/src/main.cc b/lib/utils/test/common/src/main.cc index 9522fa7fdb..6df2d925b7 100644 --- a/lib/utils/test/common/src/main.cc +++ b/lib/utils/test/common/src/main.cc @@ -1,2 +1,15 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "doctest/doctest.h" +#define DOCTEST_CONFIG_IMPLEMENT +#include + +#include +#include + +void libassert_throw_exception_handler(libassert::assertion_info const &info) { + throw std::runtime_error("Assertion failed:\n" + info.to_string()); +} + +int main(int argc, char **argv) { + libassert::set_failure_handler(libassert_throw_exception_handler); + + return doctest::Context(argc, argv).run(); +} diff --git a/lib/utils/test/common/src/test/utils/doctest/check_kv.cc b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc new file mode 100644 index 0000000000..d3c1ee335e --- /dev/null +++ b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc @@ -0,0 +1,17 @@ +#include "test/utils/doctest/check_kv.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v) { + std::ostringstream oss; + + oss << std::endl + << indent(k + "=", /*indent_size=*/4) << std::endl + << indent(v, /*indent_size=*/6); + + return oss.str(); +} + +} // namespace FlexFlow diff --git a/lib/utils/test/src/utils/containers/subvec.cc b/lib/utils/test/src/utils/containers/slice.cc similarity index 69% rename from lib/utils/test/src/utils/containers/subvec.cc rename to lib/utils/test/src/utils/containers/slice.cc index 610fc55b5a..4e4d840bfe 100644 --- a/lib/utils/test/src/utils/containers/subvec.cc +++ b/lib/utils/test/src/utils/containers/slice.cc @@ -1,4 +1,4 @@ -#include 
"utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "test/utils/doctest/fmt/vector.h" #include #include @@ -6,57 +6,57 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("subvec") { + TEST_CASE("slice") { std::vector v = {1, 2, 3, 4, 5}; - SUBCASE("Basic subvector") { - auto result = subvec(v, 1, 4); + SUBCASE("Basic slice") { + auto result = slice(v, 1, 4); std::vector correct = {2, 3, 4}; CHECK(result == correct); } SUBCASE("From beginning to index") { - auto result = subvec(v, std::nullopt, 3); + auto result = slice(v, 0, 3); std::vector correct = {1, 2, 3}; CHECK(result == correct); } SUBCASE("From index to end") { - auto result = subvec(v, 2, std::nullopt); + auto result = slice(v, 2, std::nullopt); std::vector correct = {3, 4, 5}; CHECK(result == correct); } SUBCASE("All of the vector") { - auto result = subvec(v, std::nullopt, std::nullopt); + auto result = slice(v, 0, std::nullopt); std::vector correct = {1, 2, 3, 4, 5}; CHECK(result == correct); } SUBCASE("Start greater than end") { - auto result = subvec(v, 3, 1); + auto result = slice(v, 3, 1); std::vector correct = {}; CHECK(result == correct); } SUBCASE("Start equal to end") { - auto result = subvec(v, 3, 3); + auto result = slice(v, 3, 3); std::vector correct = {}; CHECK(result == correct); } SUBCASE("Negative indices") { - auto result = subvec(v, -3, -1); + auto result = slice(v, -3, -1); std::vector correct = {3, 4}; CHECK(result == correct); } SUBCASE("Upper index is out of bounds by 1") { - CHECK_THROWS(subvec(v, 2, 6)); + CHECK_THROWS(slice(v, 2, 6)); } SUBCASE("Lower index is out of bounds by 1") { - CHECK_THROWS(subvec(v, -6, 2)); + CHECK_THROWS(slice(v, -6, 2)); } } } diff --git a/lib/utils/test/src/utils/indent.cc b/lib/utils/test/src/utils/indent.cc new file mode 100644 index 0000000000..b137253fae --- /dev/null +++ b/lib/utils/test/src/utils/indent.cc @@ -0,0 +1,66 @@ +#include "utils/indent.h" +#include + +using namespace ::FlexFlow; + 
+TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("indent") { + SUBCASE("string is empty") { + std::string input = ""; + + std::string result = indent(input); + std::string correct = " "; + + CHECK(result == correct); + } + + SUBCASE("string is one line") { + std::string input = "hello world"; + std::string result = indent(input); + std::string correct = " hello world"; + + CHECK(result == correct); + } + + SUBCASE("string has multiple lines") { + std::string input = "\n" + "a b\n" + "c d\n" + "e f\n" + "g\n"; + + std::string result = indent(input); + std::string correct = " \n" + " a b\n" + " c d\n" + " e f\n" + " g\n" + " "; + + CHECK(result == correct); + } + + SUBCASE("leading and trailing whitespace is preserved") { + std::string input = " a b \n" + "c d e\n" + " "; + + std::string result = indent(input); + std::string correct = " a b \n" + " c d e\n" + " "; + + CHECK(result == correct); + } + + SUBCASE("allows custom indent size") { + std::string input = "hello\nworld"; + + std::string result = indent(input, /*indent_size=*/4); + std::string correct = " hello\n" + " world"; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/stack_vector/stack_vector.cc b/lib/utils/test/src/utils/stack_vector/stack_vector.cc index c36de733b6..6eb2cc0d88 100644 --- a/lib/utils/test/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/test/src/utils/stack_vector/stack_vector.cc @@ -1,12 +1,97 @@ #include "utils/stack_vector/stack_vector.h" #include "test/utils/doctest/fmt/vector.h" #include "test/utils/rapidcheck.h" +#include "utils/archetypes/value_type.h" #include #include using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("operator<(stack_vector, stack_vector)") { + constexpr std::size_t MAXSIZE = 5; + + SUBCASE("T is ordered") { + SUBCASE("inputs are the same") { + std::vector input = {2, 1, 2, 3}; + + bool result = (input < input); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("lhs is strict prefix of rhs") { + 
std::vector lhs = {2, 1, 2}; + std::vector rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs is empty") { + std::vector lhs = {}; + std::vector rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs has a smaller element first") { + std::vector lhs = {2, 1, 0, 3}; + std::vector rhs = {2, 1, 2}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + // from the definition of a strict total order, i.e., + // https://en.wikipedia.org/w/index.php?title=Total_order&oldid=1278541072#Strict_and_non-strict_total_orders + RC_SUBCASE("operator< is irreflexive", + [](stack_vector const &input) { + RC_ASSERT(!(input < input)); + }); + + RC_SUBCASE("operator< is asymmetric", + [](stack_vector const &lhs, + stack_vector const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) == !(rhs < lhs)); + }); + + RC_SUBCASE("operator< is transitive", + [](stack_vector const &a, + stack_vector const &b, + stack_vector const &c) { + RC_PRE(a < b); + RC_PRE(b < c); + + RC_ASSERT(a < c); + }); + + RC_SUBCASE("operator< is connected", + [](stack_vector const &lhs, + stack_vector const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) || (rhs < lhs)); + }); + } + + SUBCASE("T is not ordered") { + bool result = is_lt_comparable_v, MAXSIZE>>; + + CHECK_FALSE(result); + } + } + TEST_CASE_TEMPLATE( "stack_vector::push_back", T, int, double, char) { constexpr std::size_t MAXSIZE = 5;