From 2dc4c60531f233b5cd364107765a907d1567a996 Mon Sep 17 00:00:00 2001 From: reyna-abhyankar Date: Fri, 10 May 2024 10:44:10 -0700 Subject: [PATCH 01/24] Add allocators --- lib/CMakeLists.txt | 1 + lib/local-execution/CMakeLists.txt | 15 +++++++++++ lib/local-execution/include/local_allocator.h | 24 +++++++++++++++++ .../include/tracked_allocator.h | 21 +++++++++++++++ lib/local-execution/src/local_allocator.cc | 20 ++++++++++++++ lib/local-execution/src/tracked_allocator.cc | 27 +++++++++++++++++++ 6 files changed, 108 insertions(+) create mode 100644 lib/local-execution/CMakeLists.txt create mode 100644 lib/local-execution/include/local_allocator.h create mode 100644 lib/local-execution/include/tracked_allocator.h create mode 100644 lib/local-execution/src/local_allocator.cc create mode 100644 lib/local-execution/src/tracked_allocator.cc diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index f7c166f0dd..8ed5d87d86 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(compiler) add_subdirectory(runtime) add_subdirectory(op-attrs) add_subdirectory(kernels) +add_subdirectory(local-execution) add_subdirectory(utils) add_subdirectory(ffi) add_subdirectory(substitutions) diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt new file mode 100644 index 0000000000..ee1d8fecdc --- /dev/null +++ b/lib/local-execution/CMakeLists.txt @@ -0,0 +1,15 @@ +ff_add_library( + NAME + local-execution + SRC_PATTERNS + src/*.cc + PUBLIC_INCLUDE + include/ + PRIVATE_INCLUDE + src/ + DEPS + op-attrs + utils + kernels + pcg +) \ No newline at end of file diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local_allocator.h new file mode 100644 index 0000000000..f4b253b281 --- /dev/null +++ b/lib/local-execution/include/local_allocator.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H +#define _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H + +#include "kernels/allocation.h" 
+#include + +namespace FlexFlow { + +struct LocalAllocator : public IAllocator { + LocalAllocator() = default; + LocalAllocator(LocalAllocator const &) = delete; + LocalAllocator(LocalAllocator &&) = delete; + ~LocalAllocator() = default; + + void *allocate(size_t) override; + void deallocate(void *) override; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalAllocator); + +Allocator get_local_memory_allocator(); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/tracked_allocator.h new file mode 100644 index 0000000000..64cc31e858 --- /dev/null +++ b/lib/local-execution/include/tracked_allocator.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H + +#include "kernels/allocation.h" + +namespace FlexFlow { + +struct TrackedAllocator: public Allocator { + Allocator() = delete; + + void *allocate(size_t mem_size); + void deallocate(void *ptr); + size_t get_current_mem_usage(); + +private: + size_t current_mem_usage; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local_allocator.cc b/lib/local-execution/src/local_allocator.cc new file mode 100644 index 0000000000..0bb7d04574 --- /dev/null +++ b/lib/local-execution/src/local_allocator.cc @@ -0,0 +1,20 @@ +#include "local_allocator.h" +#include "kernels/device.h" + +namespace FlexFlow { + +void *LocalAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + return ptr; +} + +void LocalAllocator::deallocate(void *ptr) { + checkCUDA(cudaFree(ptr)); +} + +Allocator get_local_memory_allocator() { + return Allocator::create(); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc new file mode 100644 index 0000000000..6e666b647c --- /dev/null +++ b/lib/local-execution/src/tracked_allocator.cc @@ -0,0 +1,27 @@ 
+#include "tracked_allocator.h" +#include "kernels/device.h" + +namespace FlexFlow { + +void *TrackedAllocator::allocate(size_t mem_size) { + void *ptr = this->i_allocator->allocate(mem_size); + this->curr_mem_usage += mem_size; + return ptr; +} + +void TrackedAllocator::deallocate(void *ptr) { + size_t psize; + checkCUDA(cuMemGetAddressRange(nullptr, &psize, ptr)); + this->i_allocator->deallocate(ptr); + this->curr_mem_usage -= psize; +} + +size_t TrackedAllocator::get_current_mem_usage() { + return this->curr_mem_usage; +} + +TrackedAllocator get_tracked_local_allocator() { + return Allocator::create(); +} + +} // namespace FlexFlow From 2488514f98f38ab7985b21f6ee29dace6a8e7f6e Mon Sep 17 00:00:00 2001 From: reyna-abhyankar Date: Tue, 14 May 2024 06:40:39 -0700 Subject: [PATCH 02/24] Computation Graph and Builder --- lib/pcg/include/pcg/computation_graph.h | 103 +++++- .../include/pcg/computation_graph_builder.h | 342 +++++++++--------- lib/pcg/src/computation_graph_builder.cc | 306 ++++++++++------ lib/utils/include/utils/strong_typedef.h | 4 + 4 files changed, 475 insertions(+), 280 deletions(-) diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 11dad70356..4d4fa86efa 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -4,6 +4,8 @@ #include "layer.h" #include "operator_guid_t.h" #include "tensor.h" +#include "tensor_guid_t.h" +#include "utils/containers.h" #include "utils/graph.h" #include "utils/strong_typedef.h" #include "visit_struct/visit_struct.hpp" @@ -14,12 +16,105 @@ struct ComputationGraph : public strong_typedef> { using strong_typedef::strong_typedef; + + std::vector traverse() { + std::vector layers = get_topological_ordering(this->value()); + return transform(layers, [&](Node const &e) -> operator_guid_t { + return operator_guid_t{e}; + }); + } + + std::vector traverse_reverse_order() { + std::vector layers = + 
reversed>(get_topological_ordering(this->value())); + return transform(layers, [&](Node const &e) -> operator_guid_t { + return operator_guid_t{e}; + }); + } + + bool out_edge_comparator(MultiDiOutput x, MultiDiOutput y) { + return x.src_idx < y.src_idx; + } + + std::vector + sort_edge_set(std::unordered_set edges) { + std::unordered_set outputs = + transform(edges, [&](MultiDiEdge const &e) -> MultiDiOutput { + return MultiDiOutput(e); + }); + std::vector sorted_outputs(outputs.begin(), outputs.end()); + sort(sorted_outputs.begin(), sorted_outputs.end(), out_edge_comparator); + return transform(sorted_outputs, + [&](MultiDiOutput const &e) -> tensor_guid_t { + return tensor_guid_t{e}; + }); + } + + std::vector get_outgoing_tensors(operator_guid_t n) { + return sort_edge_set(get_outgoing_edges(this->value(), n.value())); + } + + std::vector get_incoming_tensors(operator_guid_t n) { + return sort_edge_set(get_incoming_edges(this->value(), n.value())); + } + + operator_guid_t add_node(Layer const &layer) { + Node added_node = this->value().add_node(layer); + return operator_guid_t{added_node}; + } + + void add_output(tensor_guid_t const &output, Tensor const &tensor) { + this->value().add_output(output.value(), tensor); + } + + tensor_guid_t create_outgoing_edge(operator_guid_t node, int idx) { + MultiDiOutput edge = {node.value(), NodePort{idx}}; + return tensor_guid_t{edge}; + } + + tensor_guid_t create_outgoing_edge_with_label(operator_guid_t node, + int idx, + Tensor tensor) { + tensor_guid_t tensor_guid = create_outgoing_edge(node, idx); + add_output(tensor_guid, tensor); + return tensor_guid; + } + + void add_incoming_edges(std::vector const &incoming_edges, + operator_guid_t node) { + size_t incoming_edge_dst_port = 0; + for (tensor_guid_t input : incoming_edges) { + MultiDiOutput input_view = input.value(); + MultiDiEdge edge = {node.value(), + NodePort{incoming_edge_dst_port++}, + input_view.src, + input_view.src_idx}; + this->value().add_edge(edge); + } + } 
+ + Layer &at(operator_guid_t const &n) { + return this->value().at(n.value()); + } + + Layer const &at(operator_guid_t const &n) const { + return this->value().at(n.value()); + } + + Tensor &at(tensor_guid_t const &e) { + return this->value().at(e.value()); + } + + Tensor const &at(tensor_guid_t const &e) const { + return this->value().at(e.value()); + } + + CompGraphOperatorAttrs get_layer_attrs(operator_guid_t const &n) const { + return this->at(n).attrs; + } }; +CHECK_WELL_BEHAVED_VALUE_TYPE_NO_HASH(ComputationGraph); } // namespace FlexFlow -namespace FlexFlow { -static_assert(is_well_behaved_value_type_no_hash::value, ""); -} - #endif diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 035f0cad0b..1be8d7ad0e 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_PCG_INCLUDE_PCG_COMPUTATION_GRAPH_BUILDER_H #include "computation_graph.h" +#include "optimizer.h" namespace FlexFlow { @@ -12,74 +13,78 @@ struct ComputationGraphBuilder // C++ APIs for constructing models // Add an exp layer - Tensor exp(Tensor const &, - std::optional const &name = std::nullopt); + tensor_guid_t exp(tensor_guid_t const &, + std::optional const &name = std::nullopt); // Add an add layer - Tensor add(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t add(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a subtract layer - Tensor subtract(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t subtract(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a multiply layer - Tensor multiply(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t multiply(tensor_guid_t const &x, + tensor_guid_t const &y, + 
std::optional const &name = std::nullopt); // Add a divide layer - Tensor divide(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t divide(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a max layer - Tensor max(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t max(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a min layer - Tensor min(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t min(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a rsqrt layer - Tensor rsqrt(Tensor const &x, - std::optional const &name = std::nullopt); + tensor_guid_t rsqrt(tensor_guid_t const &x, + std::optional const &name = std::nullopt); // Add a pow layer - Tensor pow(Tensor const &x, - float exponent, - std::optional const &name = std::nullopt); - // Add a scalar multiply layer - Tensor scalar_multiply(Tensor const &x, - float scalar, - std::optional const &name = std::nullopt); - Tensor scalar_add(Tensor const &x, - float scalar, - std::optional const &name = std::nullopt); - Tensor scalar_sub(Tensor const &lhs, - float rhs, + tensor_guid_t pow(tensor_guid_t const &x, + float exponent, std::optional const &name = std::nullopt); - Tensor scalar_truediv(Tensor const &numerator, - float denominator, - std::optional const &name = std::nullopt); + // Add a scalar multiply layer + tensor_guid_t + scalar_multiply(tensor_guid_t const &x, + float scalar, + std::optional const &name = std::nullopt); + tensor_guid_t + scalar_add(tensor_guid_t const &x, + float scalar, + std::optional const &name = std::nullopt); + tensor_guid_t + scalar_sub(tensor_guid_t const &lhs, + float rhs, + std::optional const &name = std::nullopt); + tensor_guid_t + scalar_truediv(tensor_guid_t const &numerator, + float 
denominator, + std::optional const &name = std::nullopt); // Add a sin layer - Tensor sin(Tensor const &x, - std::optional const &name = std::nullopt); + tensor_guid_t sin(tensor_guid_t const &x, + std::optional const &name = std::nullopt); // Add a cos layer - Tensor cos(Tensor const &x, - std::optional const &name = std::nullopt); + tensor_guid_t cos(tensor_guid_t const &x, + std::optional const &name = std::nullopt); // Add an activation layer - Tensor relu(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor identity(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor gelu(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor sigmoid(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor tanh(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor elu(Tensor const &x, - std::optional const &name = std::nullopt); + tensor_guid_t relu(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t identity(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t gelu(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t sigmoid(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t tanh(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t elu(tensor_guid_t const &x, + std::optional const &name = std::nullopt); // Add a 2D convolutional layer - Tensor conv2d( - Tensor const &input, + tensor_guid_t conv2d( + tensor_guid_t const &input, int outChannels, int kernelH, int kernelW, @@ -95,13 +100,13 @@ struct ComputationGraphBuilder std::optional const &kernel_regularizer = std::nullopt, std::optional const &name = std::nullopt); // Add a dropout layer - Tensor dropout(Tensor const &input, - float rate, - unsigned long long seed = 0, - std::optional const &name = std::nullopt); + tensor_guid_t dropout(tensor_guid_t const &input, + float rate, + 
unsigned long long seed = 0, + std::optional const &name = std::nullopt); // Add an embedding layer - Tensor embedding( - Tensor const &input, + tensor_guid_t embedding( + tensor_guid_t const &input, int num_entries, int outDim, AggregateOp aggr, @@ -109,43 +114,48 @@ struct ComputationGraphBuilder std::optional const &kernel_initializer = std::nullopt, std::optional const &name = std::nullopt); // Add a gather layer - std::vector - gather(Tensor const &input, - Tensor const &index, + std::vector + gather(tensor_guid_t const &input, + tensor_guid_t const &index, ff_dim_t dim, std::optional const &name = std::nullopt); // Add a cache layer - Tensor cache(Tensor const &input, - int num_batches, - std::function - score_f = {}, - std::optional const &name = std::nullopt); + tensor_guid_t + cache(tensor_guid_t const &input, + int num_batches, + std::function + score_f = {}, + std::optional const &name = std::nullopt); // Add a 2D pooling layer - Tensor pool2d(Tensor const &input, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, - PoolOp type = PoolOp::MAX, - std::optional const &activation = std::nullopt, - std::optional const &name = std::nullopt); - Tensor layer_norm(Tensor const &input, - std::vector const &axes, - bool elementwise_affine, - float eps, - std::optional const &name = std::nullopt); - Tensor batch_norm(Tensor const &input, - bool relu = true, - std::optional const &name = std::nullopt); - Tensor batch_matmul(Tensor const &A, - Tensor const &B, - int a_seq_length_dim = -1, - int b_seq_length_dim = -1, - std::optional const &name = std::nullopt); - Tensor - dense(Tensor const &input, + tensor_guid_t + pool2d(tensor_guid_t const &input, + int kernelH, + int kernelW, + int strideH, + int strideW, + int paddingH, + int paddingW, + PoolOp type = PoolOp::MAX, + std::optional const &activation = std::nullopt, + std::optional const &name = std::nullopt); + tensor_guid_t + layer_norm(tensor_guid_t const &input, + 
std::vector const &axes, + bool elementwise_affine, + float eps, + std::optional const &name = std::nullopt); + tensor_guid_t + batch_norm(tensor_guid_t const &input, + bool relu = true, + std::optional const &name = std::nullopt); + tensor_guid_t + batch_matmul(tensor_guid_t const &A, + tensor_guid_t const &B, + int a_seq_length_dim = -1, + int b_seq_length_dim = -1, + std::optional const &name = std::nullopt); + tensor_guid_t + dense(tensor_guid_t const &input, int outDim, std::optional activation = std::nullopt, bool use_bias = true, @@ -154,55 +164,59 @@ struct ComputationGraphBuilder std::optional const &bias_initializer = std::nullopt, std::optional const &name = std::nullopt); // Add a cast layer - Tensor cast(Tensor const &input, - DataType dtype, - std::optional const &name = std::nullopt); + tensor_guid_t cast(tensor_guid_t const &input, + DataType dtype, + std::optional const &name = std::nullopt); // Add a concat layer - Tensor concat(int n, - std::vector const &tensors, - int axis, - std::optional const &name = std::nullopt); + tensor_guid_t concat(int n, + std::vector const &tensors, + int axis, + std::optional const &name = std::nullopt); // Add a mean layer - Tensor mean(Tensor const &input, - std::vector const &dims, - bool keepdims, - char const *name); + tensor_guid_t mean(tensor_guid_t const &input, + std::vector const &dims, + bool keepdims, + char const *name); // Add a split layer - void split(Tensor const &input, - Tensor *outputs, + void split(tensor_guid_t const &input, + tensor_guid_t *outputs, std::vector const &split, int axis, std::optional const &name = std::nullopt); // Add a flat layer - Tensor flat(Tensor const &input, - std::optional const &name = std::nullopt); + tensor_guid_t flat(tensor_guid_t const &input, + std::optional const &name = std::nullopt); // Add a softmax layer - Tensor softmax(Tensor const &input, - int dim = -1, - std::optional const &name = std::nullopt); + tensor_guid_t softmax(tensor_guid_t const &input, + int 
dim = -1, + std::optional const &name = std::nullopt); // Create input tensors and constants - Tensor transpose(Tensor const &input, - std::vector const &perm, - std::optional const &name = std::nullopt); - Tensor reduce_sum(Tensor const &input, - std::vector const &axes, - bool keepdims = false, - std::optional const &name = std::nullopt); - Tensor reshape(Tensor const &input, - std::vector const &shape, - std::optional const &name = std::nullopt); - Tensor reverse(Tensor const &input, - int axis, + tensor_guid_t input(Tensor const &input_tensor, + std::optional const &name = std::nullopt); + tensor_guid_t + transpose(tensor_guid_t const &input, + std::vector const &perm, + std::optional const &name = std::nullopt); + tensor_guid_t + reduce_sum(tensor_guid_t const &input, + std::vector const &axes, + bool keepdims = false, std::optional const &name = std::nullopt); - void top_k(Tensor const &input, - Tensor *outputs, + tensor_guid_t reshape(tensor_guid_t const &input, + std::vector const &shape, + std::optional const &name = std::nullopt); + tensor_guid_t reverse(tensor_guid_t const &input, + int axis, + std::optional const &name = std::nullopt); + void top_k(tensor_guid_t const &input, + tensor_guid_t *outputs, int k, bool sorted, std::optional const &name = std::nullopt); - Tensor multihead_attention( - Tensor const &query, - Tensor const &key, - Tensor const &value, + tensor_guid_t multihead_attention( + tensor_guid_t const &query, + tensor_guid_t const &key, + tensor_guid_t const &value, int embed_dim, int num_heads, int kdim = 0, @@ -213,62 +227,66 @@ struct ComputationGraphBuilder bool add_zero_attn = false, std::optional initializer = std::nullopt, std::optional const &name = std::nullopt); - Tensor create_tensor(TensorShape const &, bool create_grad = true); + tensor_guid_t create_tensor(TensorShape const &, bool create_grad = true); Parameter create_weight( TensorShape const &, bool create_grad = true, std::optional const &initializer = std::nullopt, 
std::optional sync_type = std::nullopt); - std::vector get_outputs(Layer const &) const; - Tensor get_output(Layer const &, int idx) const; - - Tensor at(MultiDiEdge const &) const; - Layer at(Node const &) const; + std::vector get_outputs(operator_guid_t const &) const; + tensor_guid_t get_output(operator_guid_t const &, int idx) const; + Tensor get_tensor(tensor_guid_t const &) const; private: - Tensor broadcast(Tensor const &, TensorShape const &); + tensor_guid_t broadcast(tensor_guid_t const &, TensorShape const &); void add_layer(Layer const &layer, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs); - Tensor add_layer( + std::vector const &inputs, + std::vector const &weights, + std::vector const &outputs); + tensor_guid_t add_layer( Layer const &layer, - std::vector const &inputs, + std::vector const &inputs, std::vector>> const &weight_shapes, - TensorShape const &output_shape); - std::vector add_layer( + Tensor const &output); + std::vector add_layer( Layer const &layer, - std::vector const &inputs, + std::vector const &inputs, std::vector>> const &weight_shapes, - std::vector const &output_shapes); + std::vector const &outputs); - Tensor as_type(Tensor const &, DataType, std::string const &); + tensor_guid_t as_type(tensor_guid_t const &, DataType, std::string const &); TensorShape get_broadcast_target_shape(std::vector const &); - Tensor element_binary(OperatorType, - Tensor const &lhs, - Tensor const &rhs, - std::optional const &name = std::nullopt); + tensor_guid_t + element_binary(OperatorType, + tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name = std::nullopt); - Tensor element_unary(OperatorType, - Tensor const &input, - std::optional const &name = std::nullopt); - Tensor element_scalar_unary( + tensor_guid_t + element_unary(OperatorType, + tensor_guid_t const &input, + std::optional const &name = std::nullopt); + tensor_guid_t element_scalar_unary( OperatorType, - Tensor const 
&input, + tensor_guid_t const &input, float scalar, std::optional const &name = std::nullopt); - Tensor element_unary(ElementUnaryAttrs const &, - Tensor const &input, - std::optional const &name = std::nullopt); - Tensor element_scalar_unary(ElementScalarUnaryAttrs const &attrs, - Tensor const &x, - std::optional const &maybe_name); + tensor_guid_t + element_unary(ElementUnaryAttrs const &, + tensor_guid_t const &input, + std::optional const &name = std::nullopt); + tensor_guid_t + element_scalar_unary(ElementScalarUnaryAttrs const &attrs, + tensor_guid_t const &x, + std::optional const &maybe_name); + + std::unordered_map pre_edge_mapping; public: ComputationGraph computation_graph; diff --git a/lib/pcg/src/computation_graph_builder.cc b/lib/pcg/src/computation_graph_builder.cc index c2e008231e..f308a4b242 100644 --- a/lib/pcg/src/computation_graph_builder.cc +++ b/lib/pcg/src/computation_graph_builder.cc @@ -6,40 +6,48 @@ namespace FlexFlow { -void ComputationGraphBuilder::add_layer(Layer const &layer, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs) { - NOT_IMPLEMENTED(); -} -Tensor ComputationGraphBuilder::add_layer( +tensor_guid_t ComputationGraphBuilder::add_layer( Layer const &layer, - std::vector const &inputs, + std::vector const &inputs, std::vector>> const &weight_shapes, - TensorShape const &output_shape) { - NOT_IMPLEMENTED(); + Tensor const &output) { + operator_guid_t node = computation_graph.add_node(layer); + this->computation_graph.add_incoming_edges(inputs, node); + return this->computation_graph.create_outgoing_edge_with_label( + node, 0, output); } -std::vector ComputationGraphBuilder::add_layer( + +std::vector ComputationGraphBuilder::add_layer( Layer const &layer, - std::vector const &inputs, + std::vector const &inputs, std::vector>> const &weight_shapes, - std::vector const &output_shapes) { - NOT_IMPLEMENTED(); + std::vector const &outputs) { + operator_guid_t node = 
computation_graph.add_node(layer); + this->computation_graph.add_incoming_edges(inputs, node); + std::vector output_tensor_guids; + for (int i = 0; i < outputs.size(); ++i) { + output_tensor_guids.push_back( + this->computation_graph.create_outgoing_edge_with_label( + node, i, outputs[i])); + } + return output_tensor_guids; } -Tensor ComputationGraphBuilder::broadcast(Tensor const &, TensorShape const &) { +tensor_guid_t ComputationGraphBuilder::broadcast(tensor_guid_t const &, + TensorShape const &) { NOT_IMPLEMENTED(); } -Tensor ComputationGraphBuilder::cast(Tensor const &input, - DataType dtype, - std::optional const &name){ - NOT_IMPLEMENTED()} +tensor_guid_t + ComputationGraphBuilder::cast(tensor_guid_t const &input, + DataType dtype, + std::optional const &name){ + NOT_IMPLEMENTED()} -Tensor ComputationGraphBuilder::as_type(Tensor const &x, - DataType data_type, - std::string const &name) { +tensor_guid_t ComputationGraphBuilder::as_type(tensor_guid_t const &x, + DataType data_type, + std::string const &name) { if (x.data_type < data_type) { return this->cast(x, data_type, name); } else if (x.data_type > data_type) { @@ -64,13 +72,14 @@ static std::string get_default_name(std::variant const &attrs) { return get_default_name(widen(attrs)); } -Tensor ComputationGraphBuilder::element_unary( +tensor_guid_t ComputationGraphBuilder::element_unary( ElementUnaryAttrs const &attrs, - Tensor const &x, + tensor_guid_t const &x, std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(attrs)); - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; TensorShape output_shape = get_output_shape(attrs, input); @@ -78,13 +87,14 @@ Tensor ComputationGraphBuilder::element_unary( return this->add_layer(layer, {input}, {}, output_shape); } -Tensor ComputationGraphBuilder::element_scalar_unary( +tensor_guid_t 
ComputationGraphBuilder::element_scalar_unary( ElementScalarUnaryAttrs const &attrs, - Tensor const &x, + tensor_guid_t const &x, std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(attrs)); - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; TensorShape output_shape = get_output_shape(attrs, input); @@ -92,39 +102,41 @@ Tensor ComputationGraphBuilder::element_scalar_unary( return this->add_layer(layer, {input}, {}, output_shape); } -Tensor ComputationGraphBuilder::element_unary( +tensor_guid_t ComputationGraphBuilder::element_unary( OperatorType op_type, - Tensor const &input, + tensor_guid_t const &input, std::optional const &name) { ElementUnaryAttrs attrs = {op_type}; return this->element_unary(attrs, input, name); } -Tensor ComputationGraphBuilder::element_scalar_unary( +tensor_guid_t ComputationGraphBuilder::element_scalar_unary( OperatorType op_type, - Tensor const &input, + tensor_guid_t const &input, float scalar, std::optional const &name) { ElementScalarUnaryAttrs attrs = {op_type, scalar}; return this->element_scalar_unary(attrs, input, name); } -Tensor ComputationGraphBuilder::element_binary( +tensor_guid_t ComputationGraphBuilder::element_binary( OperatorType op_type, - Tensor const &lhs, - Tensor const &rhs, + tensor_guid_t const &lhs, + tensor_guid_t const &rhs, std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(op_type)); TensorShape compute_shape = this->get_broadcast_target_shape({lhs, rhs}); DataType compute_type = std::max(lhs.data_type, rhs.data_type); - Tensor const lhs_input = this->as_type(this->broadcast(lhs, compute_shape), - compute_type, - name + "_inputl_pre_cast"); - Tensor const rhs_input = this->as_type(this->broadcast(rhs, compute_shape), - compute_type, - name + "_inputr_pre_cast"); + tensor_guid_t const lhs_input = 
+ this->as_type(this->broadcast(lhs, compute_shape), + compute_type, + name + "_inputl_pre_cast"); + tensor_guid_t const rhs_input = + this->as_type(this->broadcast(rhs, compute_shape), + compute_type, + name + "_inputr_pre_cast"); ElementBinaryAttrs attrs = {op_type, compute_type, false, false}; @@ -134,127 +146,179 @@ Tensor ComputationGraphBuilder::element_binary( return this->add_layer(layer, {lhs_input, rhs_input}, {}, output_shape); } -Tensor ComputationGraphBuilder::exp(Tensor const &input, - std::optional const &name) { +tensor_guid_t ComputationGraphBuilder::dense( + tensor_guid_t const &input, + int outDim, + std::optional activation, + bool use_bias, + DataType data_type, + std::optional const &kernel_initializer, + std::optional const &bias_initializer, + std::optional const &name) { + LinearAttrs attrs = { + outDim, use_bias, data_type, activation.value(), std::nullopt}; + std::string unwrapped_name = name.value_or(get_default_name(attrs)); + + tensor_guid_t input_recast = + this->as_type(input, data_type, unwrapped_name + "input_recast"); + + Layer layer = {attrs, name}; + TensorShape output_shape = get_output_shape(attrs, input_recast); + Tensor output = { + output_shape.dims, data_type, std::nullopt, false, std::nullopt}; + + std::vector>> weights; + + weights.push_back( + {get_weights_shape(attrs, input_recast), kernel_initializer}); + + if (use_bias) { + weights.push_back({get_bias_shape(attrs, input_recast), bias_initializer}); + } + + return this->add_layer(layer, {input_recast}, weights, output); +} + +tensor_guid_t + ComputationGraphBuilder::exp(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::EXP, input, name); } -Tensor ComputationGraphBuilder::add(Tensor const &lhs, - Tensor const &rhs, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::add(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_ADD, lhs, rhs, 
name); } -Tensor - ComputationGraphBuilder::subtract(Tensor const &lhs, - Tensor const &rhs, +tensor_guid_t + ComputationGraphBuilder::subtract(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, std::optional const &name) { return this->element_binary(Op::EW_SUB, lhs, rhs, name); } -Tensor - ComputationGraphBuilder::multiply(Tensor const &lhs, - Tensor const &rhs, +tensor_guid_t + ComputationGraphBuilder::multiply(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, std::optional const &name) { return this->element_binary(Op::EW_MUL, lhs, rhs, name); } -Tensor ComputationGraphBuilder::divide(Tensor const &lhs, - Tensor const &rhs, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::divide(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_DIV, lhs, rhs, name); } -Tensor ComputationGraphBuilder::max(Tensor const &lhs, - Tensor const &rhs, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::max(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_MAX, lhs, rhs, name); } -Tensor ComputationGraphBuilder::min(Tensor const &lhs, - Tensor const &rhs, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::min(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_MIN, lhs, rhs, name); } -Tensor ComputationGraphBuilder::rsqrt(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::rsqrt(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::RSQRT, input, name); } -Tensor ComputationGraphBuilder::pow(Tensor const &input, - float exponent, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::pow(tensor_guid_t const &input, + float exponent, + std::optional const &name) { return this->element_scalar_unary(Op::POW, input, 
exponent, name); } -Tensor ComputationGraphBuilder::scalar_multiply( - Tensor const &input, float scalar, std::optional const &name) { +tensor_guid_t ComputationGraphBuilder::scalar_multiply( + tensor_guid_t const &input, + float scalar, + std::optional const &name) { return this->element_scalar_unary(Op::SCALAR_MULTIPLY, input, scalar, name); } -Tensor ComputationGraphBuilder::scalar_add( - Tensor const &input, float scalar, std::optional const &name) { +tensor_guid_t ComputationGraphBuilder::scalar_add( + tensor_guid_t const &input, + float scalar, + std::optional const &name) { return this->element_scalar_unary(Op::SCALAR_ADD, input, scalar, name); } -Tensor ComputationGraphBuilder::scalar_sub( - Tensor const &lhs, float rhs, std::optional const &name) { +tensor_guid_t ComputationGraphBuilder::scalar_sub( + tensor_guid_t const &lhs, + float rhs, + std::optional const &name) { return this->element_scalar_unary(Op::SCALAR_SUB, lhs, rhs, name); } -Tensor ComputationGraphBuilder::scalar_truediv( - Tensor const &numerator, +tensor_guid_t ComputationGraphBuilder::scalar_truediv( + tensor_guid_t const &numerator, float denominator, std::optional const &name) { return this->element_scalar_unary( Op::SCALAR_TRUE_DIV, numerator, denominator, name); } -Tensor ComputationGraphBuilder::sin(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::sin(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::SIN, input, name); } -Tensor ComputationGraphBuilder::cos(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::cos(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::COS, input, name); } -Tensor ComputationGraphBuilder::relu(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::relu(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::RELU, input, 
name); } -Tensor - ComputationGraphBuilder::identity(Tensor const &input, +tensor_guid_t + ComputationGraphBuilder::identity(tensor_guid_t const &input, std::optional const &name) { return this->element_unary(Op::IDENTITY, input, name); } -Tensor ComputationGraphBuilder::gelu(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::gelu(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::GELU, input, name); } -Tensor - ComputationGraphBuilder::sigmoid(Tensor const &input, +tensor_guid_t + ComputationGraphBuilder::sigmoid(tensor_guid_t const &input, std::optional const &name) { return this->element_unary(Op::SIGMOID, input, name); } -Tensor ComputationGraphBuilder::tanh(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::tanh(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::TANH, input, name); } -Tensor ComputationGraphBuilder::elu(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::elu(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::ELU, input, name); } -Tensor ComputationGraphBuilder::conv2d( - Tensor const &x, +tensor_guid_t ComputationGraphBuilder::conv2d( + tensor_guid_t const &x, int outChannels, int kernelH, int kernelW, @@ -281,7 +345,8 @@ Tensor ComputationGraphBuilder::conv2d( use_bias}; std::string name = maybe_name.value_or(get_default_name(attrs)); - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; TensorShape output_shape = get_output_shape(attrs, input); @@ -297,8 +362,8 @@ Tensor ComputationGraphBuilder::conv2d( return this->add_layer(layer, {input}, weights, output_shape); } -Tensor ComputationGraphBuilder::dropout( - Tensor const &x, +tensor_guid_t 
ComputationGraphBuilder::dropout( + tensor_guid_t const &x, float rate, unsigned long long seed, std::optional const &maybe_name) { @@ -306,15 +371,16 @@ Tensor ComputationGraphBuilder::dropout( std::string name = maybe_name.value_or(get_default_name(attrs)); Layer layer = {attrs, name}; - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); TensorShape output_shape = get_output_shape(attrs, input); return this->add_layer(layer, {input}, {}, output_shape); } -Tensor ComputationGraphBuilder::embedding( - Tensor const &x, +tensor_guid_t ComputationGraphBuilder::embedding( + tensor_guid_t const &x, int num_entries, int outDim, AggregateOp aggr, @@ -325,7 +391,8 @@ Tensor ComputationGraphBuilder::embedding( std::string name = maybe_name.value_or(get_default_name(attrs)); Layer layer = {attrs, name}; - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); TensorShape output_shape = get_output_shape(attrs, input); TensorShape weights_shape = get_weights_shape(attrs, input); @@ -334,9 +401,9 @@ Tensor ComputationGraphBuilder::embedding( layer, {input}, {{weights_shape, kernel_initializer}}, output_shape); } -std::vector ComputationGraphBuilder::gather( - Tensor const &input, - Tensor const &index, +std::vector ComputationGraphBuilder::gather( + tensor_guid_t const &input, + tensor_guid_t const &index, ff_dim_t dim, std::optional const &maybe_name) { GatherAttrs attrs = {dim}; @@ -357,19 +424,30 @@ std::vector ComputationGraphBuilder::gather( return this->add_layer(layer, {input}, {}, output_shapes); } -TensorShape get_shape(Tensor const &t) { - return t.get_shape(); +tensor_guid_t + ComputationGraphBuilder::input(Tensor const &input_tensor, + std::optional const &name) { + InputAttrs input_attrs = {}; + std::string name = 
name.value_or(get_default_name(input_attrs)); + + Layer layer = {attrs, name}; + + return this->add_layer(layer, {}, {}, input_tensor); +} + +TensorShape get_shape(tensor_guid_t const &t) { + return this->computation_graph.at(t).get_shape(); } -std::vector get_shape(std::vector const &) { +std::vector get_shape(std::vector const &) { NOT_IMPLEMENTED(); } -// Tensor ComputationGraphBuilder::aggregate( -// Tensor const &gate_preds, -// Tensor const &gate_assign, -// Tensor const &true_gate_assign, -// Tensor const &full_gate_gradients, -// std::vector const &exp_preds, +// tensor_guid_t ComputationGraphBuilder::aggregate( +// tensor_guid_t const &gate_preds, +// tensor_guid_t const &gate_assign, +// tensor_guid_t const &true_gate_assign, +// tensor_guid_t const &full_gate_gradients, +// std::vector const &exp_preds, // int n, // float lambda_bal, // std::optional const &maybe_name) { @@ -384,14 +462,14 @@ std::vector get_shape(std::vector const &) { // get_shape(full_gate_gradients), // get_shape(exp_preds)); -// std::vector inputs = { +// std::vector inputs = { // gate_preds, gate_assign, true_gate_assign, full_gate_gradients}; // extend(inputs, exp_preds); // return this->add_layer(layer, inputs, {}, output_shape); // } -Tensor ComputationGraphBuilder::batch_norm( - Tensor const &input, +tensor_guid_t ComputationGraphBuilder::batch_norm( + tensor_guid_t const &input, bool relu, std::optional const &maybe_name) { BatchNormAttrs attrs = BatchNormAttrs{relu}; diff --git a/lib/utils/include/utils/strong_typedef.h b/lib/utils/include/utils/strong_typedef.h index f700a20c79..63c41e0e5e 100644 --- a/lib/utils/include/utils/strong_typedef.h +++ b/lib/utils/include/utils/strong_typedef.h @@ -65,6 +65,10 @@ class strong_typedef { return value_; } + T &value() noexcept { + return value_; + } + template strong_typedef fmap(F const &f) { static_assert( From 9a59f34fded4af8f2061c1e95d4ee4679340404e Mon Sep 17 00:00:00 2001 From: reyna-abhyankar Date: Tue, 14 May 2024 06:47:15 
-0700 Subject: [PATCH 03/24] Shift ops and remove legion names --- .../src/ops/attention.cc | 110 +++++---- .../src/ops/attention.h | 2 +- .../src/ops/batch_matmul.cc | 36 +-- .../src/ops/batch_matmul.h | 4 +- .../src/ops/batch_norm.cc | 69 ++---- .../src/ops/batch_norm.h | 2 +- .../src/ops/cast.cc | 37 +--- .../src/ops/cast.h | 2 +- .../src/ops/combine.cc | 34 +-- .../src/ops/combine.h | 2 +- .../src/ops/concat.cc | 41 +--- .../src/ops/concat.h | 2 +- .../src/ops/conv_2d.cc | 76 ++----- .../src/ops/conv_2d.h | 2 +- .../src/ops/dropout.cc | 55 ++--- .../src/ops/dropout.h | 2 +- .../src/ops/element_binary.cc | 64 ++---- .../src/ops/element_binary.h | 0 .../src/ops/element_unary.cc | 61 ++--- .../src/ops/element_unary.h | 3 - .../src/ops/embedding.cc | 38 +--- .../src/ops/embedding.h | 2 +- .../src/ops/flat.cc | 32 +-- .../src/ops/flat.h | 0 .../src/ops/layer_norm.cc | 135 +++++------- .../src/ops/layer_norm.h | 0 .../src/ops/linear.cc | 208 ++++++++---------- .../src/ops/linear.h | 0 .../src/ops/noop.cc | 2 +- .../src/ops/noop.h | 2 +- .../src/ops/parallel_op.h | 2 +- .../src/ops/partition.cc | 119 ++++------ .../src/ops/pool_2d.cc | 142 +++++------- .../src/ops/pool_2d.h | 2 +- .../src/ops/reduce.cc | 108 ++++----- .../src/ops/reduce.h | 2 +- .../src/ops/reduction.cc | 95 +++----- .../src/ops/reduction.h | 4 +- .../src/ops/repartition.h | 2 +- .../src/ops/replicate.cc | 75 +++---- .../src/ops/replicate.h | 2 +- .../src/ops/reshape.cc | 80 ++----- .../src/ops/reshape.h | 2 +- .../src/ops/reverse.cc | 80 +++---- .../src/ops/reverse.h | 2 +- .../src/ops/softmax.cc | 96 +++----- .../src/ops/softmax.h | 2 +- .../src/ops/split.cc | 110 ++++----- .../src/ops/split.h | 4 +- .../src/ops/topk.cc | 136 +++++------- .../src/ops/topk.h | 2 +- .../src/ops/transpose.cc | 114 ++++------ .../src/ops/transpose.h | 2 +- 53 files changed, 782 insertions(+), 1424 deletions(-) rename lib/{runtime => local-execution}/src/ops/attention.cc (84%) rename lib/{runtime => 
local-execution}/src/ops/attention.h (96%) rename lib/{runtime => local-execution}/src/ops/batch_matmul.cc (88%) rename lib/{runtime => local-execution}/src/ops/batch_matmul.h (90%) rename lib/{runtime => local-execution}/src/ops/batch_norm.cc (81%) rename lib/{runtime => local-execution}/src/ops/batch_norm.h (95%) rename lib/{runtime => local-execution}/src/ops/cast.cc (80%) rename lib/{runtime => local-execution}/src/ops/cast.h (97%) rename lib/{runtime => local-execution}/src/ops/combine.cc (77%) rename lib/{runtime => local-execution}/src/ops/combine.h (94%) rename lib/{runtime => local-execution}/src/ops/concat.cc (79%) rename lib/{runtime => local-execution}/src/ops/concat.h (95%) rename lib/{runtime => local-execution}/src/ops/conv_2d.cc (78%) rename lib/{runtime => local-execution}/src/ops/conv_2d.h (95%) rename lib/{runtime => local-execution}/src/ops/dropout.cc (77%) rename lib/{runtime => local-execution}/src/ops/dropout.h (95%) rename lib/{runtime => local-execution}/src/ops/element_binary.cc (81%) rename lib/{runtime => local-execution}/src/ops/element_binary.h (100%) rename lib/{runtime => local-execution}/src/ops/element_unary.cc (77%) rename lib/{runtime => local-execution}/src/ops/element_unary.h (91%) rename lib/{runtime => local-execution}/src/ops/embedding.cc (82%) rename lib/{runtime => local-execution}/src/ops/embedding.h (94%) rename lib/{runtime => local-execution}/src/ops/flat.cc (78%) rename lib/{runtime => local-execution}/src/ops/flat.h (100%) rename lib/{runtime => local-execution}/src/ops/layer_norm.cc (62%) rename lib/{runtime => local-execution}/src/ops/layer_norm.h (100%) rename lib/{runtime => local-execution}/src/ops/linear.cc (55%) rename lib/{runtime => local-execution}/src/ops/linear.h (100%) rename lib/{runtime => local-execution}/src/ops/noop.cc (96%) rename lib/{runtime => local-execution}/src/ops/noop.h (89%) rename lib/{runtime => local-execution}/src/ops/parallel_op.h (96%) rename lib/{runtime => 
local-execution}/src/ops/partition.cc (59%) rename lib/{runtime => local-execution}/src/ops/pool_2d.cc (58%) rename lib/{runtime => local-execution}/src/ops/pool_2d.h (97%) rename lib/{runtime => local-execution}/src/ops/reduce.cc (60%) rename lib/{runtime => local-execution}/src/ops/reduce.h (98%) rename lib/{runtime => local-execution}/src/ops/reduction.cc (58%) rename lib/{runtime => local-execution}/src/ops/reduction.h (96%) rename lib/{runtime => local-execution}/src/ops/repartition.h (98%) rename lib/{runtime => local-execution}/src/ops/replicate.cc (65%) rename lib/{runtime => local-execution}/src/ops/replicate.h (95%) rename lib/{runtime => local-execution}/src/ops/reshape.cc (68%) rename lib/{runtime => local-execution}/src/ops/reshape.h (98%) rename lib/{runtime => local-execution}/src/ops/reverse.cc (69%) rename lib/{runtime => local-execution}/src/ops/reverse.h (95%) rename lib/{runtime => local-execution}/src/ops/softmax.cc (67%) rename lib/{runtime => local-execution}/src/ops/softmax.h (98%) rename lib/{runtime => local-execution}/src/ops/split.cc (67%) rename lib/{runtime => local-execution}/src/ops/split.h (95%) rename lib/{runtime => local-execution}/src/ops/topk.cc (59%) rename lib/{runtime => local-execution}/src/ops/topk.h (98%) rename lib/{runtime => local-execution}/src/ops/transpose.cc (55%) rename lib/{runtime => local-execution}/src/ops/transpose.h (98%) diff --git a/lib/runtime/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc similarity index 84% rename from lib/runtime/src/ops/attention.cc rename to lib/local-execution/src/ops/attention.cc index 41905f9014..854213a955 100644 --- a/lib/runtime/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -15,19 +15,12 @@ #include "attention.h" #include "kernels/attention_kernels.h" -#include "legion.h" -#include "op-attrs/ops/attention.h" -#include "task_spec/op_task_signature.h" +#include "op_task_signature.h" namespace FlexFlow { using namespace 
FlexFlow::Kernels::MultiHeadAttention; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { QUERY_PARALLEL_TENSOR_SHAPE, KEY_PARALLEL_TENSOR_SHAPE, @@ -86,6 +79,12 @@ OpTaskInvocation backward(MultiHeadAttentionAttrs const &attrs) { return {ATTENTION_BWD_TASK_ID, b}; } +// OpArgBacking +// generate_op_arg_backing(std::vector +// tensor_shape_args) { + +// } + static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); @@ -122,35 +121,42 @@ static DeviceSpecific int num_samples = get_piece_shape(query_parallel_tensor_shape)[ff_dim_t(2)]; int num_heads = get_piece_shape(weight_parallel_tensor_shape)[ff_dim_t(1)]; + // MHAPerDeviceState per_device_state = + // init_kernel(handle, + // allocator, + // num_samples, + // num_heads, + // qSize, + // kSize, + // vSize, + // qProjSize, + // kProjSize, + // vProjSize, + // oProjSize, + // qoSeqLength, + // kvSeqLength, + // attrs.add_bias_kv); + // return acc.create_device_specific(per_device_state); + DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - attrs.add_bias_kv)); + init_kernel(handle, + allocator, + num_samples, + num_heads, + qSize, + kSize, + vSize, + qProjSize, + kProjSize, + vProjSize, + oProjSize, + qoSeqLength, + kvSeqLength, + attrs.add_bias_kv); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto query = acc.get_tensor(QUERY); auto key = acc.get_tensor(KEY); auto value = 
acc.get_tensor(VALUE); @@ -171,15 +177,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto query = acc.get_tensor(QUERY); auto key = acc.get_tensor(KEY); auto value = acc.get_tensor(VALUE); @@ -221,14 +220,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, MultiHeadAttentionAttrs const &attrs, InputParallelTensorDesc const &query_shape, @@ -286,7 +277,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(QUERY_PARALLEL_TENSOR_SHAPE); init.add_arg_slot(KEY_PARALLEL_TENSOR_SHAPE); init.add_arg_slot(VALUE_PARALLEL_TENSOR_SHAPE); @@ -307,12 +299,18 @@ void register_task() { register_task(ATTENTION_INIT_TASK_ID, "Attention Init", init_signature(), - init_task); + init_task_impl); +} + +template <> +OpTaskSignature get_signature() { + return init_signature(); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(QUERY); fwd.add_input_slot(KEY); @@ -331,13 +329,13 @@ void register_task() { register_task(ATTENTION_FWD_TASK_ID, "Attention Fwd", fwd_signature(), - forward_task); + forward_task_impl); 
} template <> OpTaskSignature bwd_signature() { OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(ATTENTION_FWD_TASK_ID)); + infer_bwd_signature(fwd_signature()); return bwd; } @@ -347,7 +345,7 @@ void register_task() { register_task(ATTENTION_BWD_TASK_ID, "Attention Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/attention.h b/lib/local-execution/src/ops/attention.h similarity index 96% rename from lib/runtime/src/ops/attention.h rename to lib/local-execution/src/ops/attention.h index 09a4ef036f..601d8a4796 100644 --- a/lib/runtime/src/ops/attention.h +++ b/lib/local-execution/src/ops/attention.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_ATTENTION_H #include "op-attrs/ops/attention.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc similarity index 88% rename from lib/runtime/src/ops/batch_matmul.cc rename to lib/local-execution/src/ops/batch_matmul.cc index 5f40def699..c5df564afd 100644 --- a/lib/runtime/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -15,20 +15,14 @@ #include "batch_matmul.h" #include "kernels/batch_matmul_kernels.h" -#include "legion.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" -#include "task_spec/op_task_signature.h" +#include "op_task_signature.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchMatmul; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { A_INPUT, // tensor B_INPUT, // tensor @@ -60,7 +54,7 @@ OpTaskInvocation backward(BatchMatmulAttrs const &attrs) { return {BATCHMATMUL_BWD_TASK_ID, bwd}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto 
a_input = acc.get_tensor(A_INPUT); auto b_input = acc.get_tensor(B_INPUT); auto output = acc.get_tensor(OUTPUT); @@ -105,15 +99,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { iter_config.seq_length); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { // BatchMatmul* bmm = (BatchMatmul*) task->args; FFIterationConfig iter_config = acc.get_argument(ITERATION_CONFIG); @@ -165,14 +152,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { batch); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, BatchMatmulAttrs const &attrs, InputParallelTensorDesc const &a_input, @@ -208,7 +187,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(A_INPUT); fwd.add_input_slot(B_INPUT); @@ -225,7 +205,7 @@ void register_task() { register_task(BATCHMATMUL_FWD_TASK_ID, "BatchMatmul Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -241,7 +221,7 @@ void register_task() { register_task(BATCHMATMUL_BWD_TASK_ID, "BatchMatmul Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/batch_matmul.h b/lib/local-execution/src/ops/batch_matmul.h similarity index 90% rename from lib/runtime/src/ops/batch_matmul.h rename to lib/local-execution/src/ops/batch_matmul.h index 7d3f2308da..6791b11a8c 
100644 --- a/lib/runtime/src/ops/batch_matmul.h +++ b/lib/local-execution/src/ops/batch_matmul.h @@ -2,9 +2,9 @@ #define _FLEXFLOW_BATCH_MATMUL_H #include "op-attrs/ops/batch_matmul.h" +#include "op_task_invocation.h" +#include "op_task_signature.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc similarity index 81% rename from lib/runtime/src/ops/batch_norm.cc rename to lib/local-execution/src/ops/batch_norm.cc index a52981a8a3..dadfab14e0 100644 --- a/lib/runtime/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -15,17 +15,11 @@ #include "batch_norm.h" #include "kernels/batch_norm_kernels.h" -#include "legion/legion_utilities.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchNorm; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { INPUT, // tensor SCALE, // tensor @@ -88,29 +82,19 @@ static DeviceSpecific float *runningMean; DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - runningMean, - output_n, - output_c, - output_h, - output_w, - attrs.relu)); + init_kernel(handle, + allocator, + runningMean, + output_n, + output_c, + output_h, + output_w, + attrs.relu); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -123,22 +107,15 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) 
{ return profile(forward_kernel, profiling, "[BatchNorm] forward_time = %.2lfms\n", - &per_device_state, + per_device_state, input.get_float_ptr(), output.get_float_ptr(), scale.get_float_ptr(), bias.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -154,7 +131,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, "[BatchNorm] backward_time = %.2lfms\n", - &per_device_state, + per_device_state, input.get_float_ptr(), output_grad.get_float_ptr(), output.get_float_ptr(), @@ -165,14 +142,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output.shape.get_volume()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, BatchNormAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -220,7 +189,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); init.add_input_slot(BIAS); init.add_output_slot(OUTPUT); @@ -236,12 +206,13 @@ void register_task() { register_task(BATCHNORM_INIT_TASK_ID, "BatchNorm Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; 
+ fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_input_slot(SCALE); @@ -258,7 +229,7 @@ void register_task() { register_task(BATCHNORM_FWD_TASK_ID, "BatchNorm Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -274,7 +245,7 @@ void register_task() { register_task(BATCHNORM_BWD_TASK_ID, "BatchNorm Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/batch_norm.h b/lib/local-execution/src/ops/batch_norm.h similarity index 95% rename from lib/runtime/src/ops/batch_norm.h rename to lib/local-execution/src/ops/batch_norm.h index 906e85a57c..6fae871c2c 100644 --- a/lib/runtime/src/ops/batch_norm.h +++ b/lib/local-execution/src/ops/batch_norm.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_BATCH_NORM_H #include "op-attrs/ops/batch_norm.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc similarity index 80% rename from lib/runtime/src/ops/cast.cc rename to lib/local-execution/src/ops/cast.cc index 44230eaf46..0914ea40a6 100644 --- a/lib/runtime/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -15,17 +15,12 @@ #include "cast.h" #include "kernels/cast_kernels.h" -#include "legion/legion_utilities.h" -#include "task_spec/op_task_signature.h" + +#include "op_task_signature.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Cast; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - namespace FlexFlow { enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -48,7 +43,7 @@ OpTaskInvocation backward(CastAttrs const &attrs) { return {CAST_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = 
acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -64,15 +59,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.dtype); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -90,14 +78,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.dtype); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, CastAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -127,7 +107,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); @@ -143,7 +124,7 @@ void register_task() { register_task(CAST_FWD_TASK_ID, "Cast Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -158,7 +139,7 @@ void register_task() { register_task(CAST_BWD_TASK_ID, "Cast Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/cast.h b/lib/local-execution/src/ops/cast.h similarity index 97% rename from lib/runtime/src/ops/cast.h rename to lib/local-execution/src/ops/cast.h index c0c500e869..ce9a93aa32 100644 --- a/lib/runtime/src/ops/cast.h +++ b/lib/local-execution/src/ops/cast.h @@ -16,8 +16,8 @@ #define _FLEXFLOW_CAST_H 
#include "op-attrs/ops/cast.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc similarity index 77% rename from lib/runtime/src/ops/combine.cc rename to lib/local-execution/src/ops/combine.cc index 46d5ebb4fe..942d964021 100644 --- a/lib/runtime/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -15,15 +15,11 @@ #include "combine.h" #include "kernels/combine_kernels.h" -#include "task_spec/op_task_invocation.h" +#include "op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::Combine; @@ -46,7 +42,7 @@ OpTaskInvocation backward(CombineAttrs const &attrs) { return {COMBINE_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -59,15 +55,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); @@ -80,14 +69,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, 
ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, CombineAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -103,7 +84,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); @@ -117,7 +99,7 @@ void register_task() { register_task(COMBINE_FWD_TASK_ID, "Combine Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -133,7 +115,7 @@ void register_task() { register_task(COMBINE_BWD_TASK_ID, "Combine Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/combine.h b/lib/local-execution/src/ops/combine.h similarity index 94% rename from lib/runtime/src/ops/combine.h rename to lib/local-execution/src/ops/combine.h index 6b3a43863b..5923e9ebcc 100644 --- a/lib/runtime/src/ops/combine.h +++ b/lib/local-execution/src/ops/combine.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_COMBINE_H #include "op-attrs/ops/combine.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc similarity index 79% rename from lib/runtime/src/ops/concat.cc rename to lib/local-execution/src/ops/concat.cc index 1ce549cc57..3d62c19f20 100644 --- a/lib/runtime/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -15,21 +15,16 @@ #include "concat.h" #include "kernels/concat_kernels.h" -#include "legion/legion_utilities.h" + #include "op-attrs/get_output_shapes.h" -#include "task_spec/op_task_signature.h" -#include "task_spec/variadic_tensor_ref.h" +#include "op_task_signature.h" #include "utils/hash-utils.h" +#include "variadic_tensor_ref.h" namespace FlexFlow { using namespace 
FlexFlow::Kernels::Concat; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { INPUTS, OUTPUT, ATTRS, PROFILING, HANDLE, NUM_INPUTS }; OpTaskInvocation forward(ConcatAttrs const &attrs) { @@ -48,7 +43,7 @@ OpTaskInvocation backward(ConcatAttrs const &attrs) { return {CONCAT_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -65,15 +60,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.axis); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -90,14 +78,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.axis); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ConcatAttrs const &attrs, @@ -131,7 +111,8 @@ CostMetrics template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUTS, SlotType::VARIADIC); @@ -145,13 +126,13 @@ void register_task() { register_task(CONCAT_FWD_TASK_ID, "Concat Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> OpTaskSignature 
bwd_signature() { OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(CONCAT_FWD_TASK_ID)); + infer_bwd_signature(fwd_signature()); return bwd; } @@ -161,7 +142,7 @@ void register_task() { register_task(CONCAT_BWD_TASK_ID, "Concat Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/concat.h b/lib/local-execution/src/ops/concat.h similarity index 95% rename from lib/runtime/src/ops/concat.h rename to lib/local-execution/src/ops/concat.h index 27dec47743..d0a432e8b3 100644 --- a/lib/runtime/src/ops/concat.h +++ b/lib/local-execution/src/ops/concat.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_CONCAT_H #include "op-attrs/ops/concat.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc similarity index 78% rename from lib/runtime/src/ops/conv_2d.cc rename to lib/local-execution/src/ops/conv_2d.cc index 01d8abab55..0df15e9b23 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -1,17 +1,10 @@ #include "conv_2d.h" #include "kernels/conv_2d_kernels.h" -#include "legion/legion_utilities.h" -#include "mpark/variant.hpp" #include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Conv2D; enum Slots { @@ -70,33 +63,23 @@ static DeviceSpecific auto filter_grad = acc.get_tensor_grad(FILTER); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - attrs.activation, - attrs.kernel_h, - attrs.kernel_w, - attrs.groups, - attrs.padding_h, - attrs.padding_w, - attrs.stride_h, - attrs.stride_w, - input, - output, - filter.get_float_ptr(), - filter_grad.get_float_ptr())); + init_kernel(handle, + attrs.activation, + 
attrs.kernel_h, + attrs.kernel_w, + attrs.groups, + attrs.padding_h, + attrs.padding_w, + attrs.stride_h, + attrs.stride_w, + input, + output, + filter.get_float_ptr(), + filter_grad.get_float_ptr()); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -118,15 +101,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.activation); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -155,14 +131,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.activation); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, Conv2DAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -210,7 +178,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); init.add_output_slot(OUTPUT); 
@@ -228,12 +197,13 @@ void register_task() { register_task(CONV2D_INIT_TASK_ID, "Conv2d Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); @@ -252,7 +222,7 @@ void register_task() { register_task(CONV2D_FWD_TASK_ID, "Conv2d Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -268,7 +238,7 @@ void register_task() { register_task(CONV2D_BWD_TASK_ID, "Conv2d Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/conv_2d.h b/lib/local-execution/src/ops/conv_2d.h similarity index 95% rename from lib/runtime/src/ops/conv_2d.h rename to lib/local-execution/src/ops/conv_2d.h index 7225099a47..0e92b00553 100644 --- a/lib/runtime/src/ops/conv_2d.h +++ b/lib/local-execution/src/ops/conv_2d.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_CONV_2D_H #include "op-attrs/ops/conv_2d.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc similarity index 77% rename from lib/runtime/src/ops/dropout.cc rename to lib/local-execution/src/ops/dropout.cc index fe85afea38..236b7e2c88 100644 --- a/lib/runtime/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -1,18 +1,12 @@ #include "dropout.h" #include "kernels/dropout_kernels.h" -#include "legion/legion_utilities.h" #include "op-attrs/get_output_shapes.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/task_signature.h" +#include "op_task_invocation.h" +#include "op_task_signature.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace 
FlexFlow::Kernels::Dropout; enum Slots { INPUT, OUTPUT, ATTRS, PER_DEVICE_STATE, FF_HANDLE, PROFILING }; @@ -54,21 +48,11 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator)); + init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -83,15 +67,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -108,14 +85,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { input_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, DropoutAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -155,7 +124,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, 
template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(FF_HANDLE); init.add_output_slot(OUTPUT); @@ -170,12 +140,13 @@ void register_task() { register_task(DROPOUT_INIT_TASK_ID, "Dropout Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); @@ -191,7 +162,7 @@ void register_task() { register_task(DROPOUT_FWD_TASK_ID, "Dropout Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -207,7 +178,7 @@ void register_task() { register_task(DROPOUT_BWD_TASK_ID, "Dropout Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/dropout.h b/lib/local-execution/src/ops/dropout.h similarity index 95% rename from lib/runtime/src/ops/dropout.h rename to lib/local-execution/src/ops/dropout.h index 88a255d140..4f22842c8a 100644 --- a/lib/runtime/src/ops/dropout.h +++ b/lib/local-execution/src/ops/dropout.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_DROPOUT_H #include "op-attrs/ops/dropout.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" #include "tasks.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc similarity index 81% rename from lib/runtime/src/ops/element_binary.cc rename to lib/local-execution/src/ops/element_binary.cc index f6be2198ca..0cec2b8d0a 100644 --- a/lib/runtime/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -1,16 +1,11 @@ #include "element_binary.h" #include "kernels/element_binary_kernels.h" -#include "legion/legion_utilities.h" + #include 
"op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::ElementBinary; enum Slots { @@ -66,27 +61,17 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - attrs.type, - attrs.should_broadcast_lhs, - attrs.should_broadcast_rhs, - input_lhs.shape, - input_rhs.shape, - output.shape)); + init_kernel(handle, + attrs.type, + attrs.should_broadcast_lhs, + attrs.should_broadcast_rhs, + input_lhs.shape, + input_rhs.shape, + output.shape); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -109,15 +94,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { handle); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -146,14 +124,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { handle); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor 
acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ElementBinaryAttrs const &attrs, @@ -203,7 +173,8 @@ CostMetrics template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(LHS_INPUT); init.add_input_slot(RHS_INPUT); @@ -221,12 +192,13 @@ void register_task() { register_task(ELEMENTBINARY_INIT_TASK_ID, "ElementBinary Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); @@ -245,7 +217,7 @@ void register_task() { register_task(ELEMENTBINARY_FWD_TASK_ID, "ElementBinary Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -261,7 +233,7 @@ void register_task() { register_task(ELEMENTBINARY_BWD_TASK_ID, "ElementBinary Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/element_binary.h b/lib/local-execution/src/ops/element_binary.h similarity index 100% rename from lib/runtime/src/ops/element_binary.h rename to lib/local-execution/src/ops/element_binary.h diff --git a/lib/runtime/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc similarity index 77% rename from lib/runtime/src/ops/element_unary.cc rename to lib/local-execution/src/ops/element_unary.cc index f41a8b3551..9567fc1570 100644 --- a/lib/runtime/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -1,15 +1,11 @@ #include "element_unary.h" #include "kernels/element_unary_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::Context; -using 
Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::ElementUnary; @@ -27,7 +23,6 @@ enum Slots { OpTaskInvocation init(ElementUnaryUnifiedAttrs const &attrs) { OpTaskBinding b; - b.bind_arg(HANDLE, ff_handle()); b.bind_arg(ATTRS, attrs); b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0)); @@ -58,32 +53,21 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); ProfilingSettings profiling = acc.get_argument(PROFILING); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); ParallelTensorShape input_shape = acc.get_argument(INPUT_SHAPE); ParallelTensorShape output_shape = get_output_shape(attrs, input_shape); - DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(input_shape, output_shape, attrs)); + DeviceSpecific per_device_state = init_kernel( + get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto &handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = @@ -99,22 +83,15 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + 
backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto &handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -132,14 +109,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ElementUnaryUnifiedAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -147,7 +116,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, MachineView const &mv) { auto env = sim.new_environment(); - ParallelTensorShape output_shape = get_output_shape(attrs, input_shape); + ParallelTensorShape output_shape = get_output_shape(attrs, input_shape.shape); SimTaskBinding init_binding; init_binding.bind_arg(HANDLE, ff_handle()); @@ -181,7 +150,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(INPUT_SHAPE); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -196,12 +166,13 @@ void register_task() { register_task(ELEMENTUNARY_INIT_TASK_ID, "ElementUnary Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); @@ -217,7 +188,7 @@ void register_task() { register_task(ELEMENTUNARY_FWD_TASK_ID, "ElementUnary 
Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -233,7 +204,7 @@ void register_task() { register_task(ELEMENTUNARY_BWD_TASK_ID, "ElementUnary Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/element_unary.h b/lib/local-execution/src/ops/element_unary.h similarity index 91% rename from lib/runtime/src/ops/element_unary.h rename to lib/local-execution/src/ops/element_unary.h index f44efc28db..83f6177b8d 100644 --- a/lib/runtime/src/ops/element_unary.h +++ b/lib/local-execution/src/ops/element_unary.h @@ -7,9 +7,6 @@ namespace FlexFlow { -using ElementUnaryUnifiedAttrs = - variant; - template <> void register_task(); template <> diff --git a/lib/runtime/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc similarity index 82% rename from lib/runtime/src/ops/embedding.cc rename to lib/local-execution/src/ops/embedding.cc index a1bc915d2f..31dc83814f 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -15,17 +15,11 @@ #include "embedding.h" #include "kernels/embedding_kernels.h" -#include "legion.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/embedding.h" namespace FlexFlow { -// declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Embedding; enum Slots { INPUT, WEIGHT, OUTPUT, ATTRS, PROFILING }; @@ -49,7 +43,7 @@ OpTaskInvocation backward(EmbeddingAttrs const &attrs) { return {EMBED_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); @@ -71,15 +65,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { input.shape[legion_dim_t(1)]); } -static void 
forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); @@ -98,15 +85,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), - input.shape[ff_dim_t(0)]); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + input.shape.at(ff_dim_t(0))); } CostMetrics measure_operator_cost(SimEnvFactory const &sim, @@ -141,7 +120,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_input_slot(OUTPUT); @@ -158,7 +138,7 @@ void register_task() { register_task(EMBED_FWD_TASK_ID, "Embed Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -172,7 +152,7 @@ void register_task() { register_task(EMBED_BWD_TASK_ID, "Embed Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/embedding.h b/lib/local-execution/src/ops/embedding.h similarity index 94% rename from lib/runtime/src/ops/embedding.h rename to lib/local-execution/src/ops/embedding.h index cd1b14fa66..b4caebf952 100644 --- a/lib/runtime/src/ops/embedding.h +++ b/lib/local-execution/src/ops/embedding.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_EMBEDDING_H #include "op-attrs/ops/embedding.h" +#include "op_task_invocation.h" #include "sim_environment.h" 
-#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc similarity index 78% rename from lib/runtime/src/ops/flat.cc rename to lib/local-execution/src/ops/flat.cc index f53a6185b6..45d3805e0c 100644 --- a/lib/runtime/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -5,10 +5,6 @@ namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::Flat; @@ -30,7 +26,7 @@ OpTaskInvocation backward(FlatAttrs const &attrs) { return {FLAT_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -42,15 +38,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -65,14 +54,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, FlatAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -101,7 +82,8 @@ CostMetrics 
measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); @@ -115,7 +97,7 @@ void register_task() { register_task(FLAT_FWD_TASK_ID, "Flat Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -130,7 +112,7 @@ void register_task() { register_task(FLAT_BWD_TASK_ID, "Flat Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/flat.h b/lib/local-execution/src/ops/flat.h similarity index 100% rename from lib/runtime/src/ops/flat.h rename to lib/local-execution/src/ops/flat.h diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc similarity index 62% rename from lib/runtime/src/ops/layer_norm.cc rename to lib/local-execution/src/ops/layer_norm.cc index 6bc671c249..3caf95c068 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -15,26 +15,27 @@ #include "layer_norm.h" #include "kernels/layer_norm_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" #include -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - namespace FlexFlow { +using namespace FlexFlow::Kernels::LayerNorm; + enum Slots { PROFILING, INPUT, + INPUT_GRAD, OUTPUT, + OUTPUT_GRAD, GAMMA, + GAMMA_GRAD, BETA, + BETA_GRAD, PER_DEVICE_STATE, ATTRS, HANDLE @@ -59,7 +60,7 @@ OpTaskInvocation forward(LayerNormAttrs const &attrs) { b.bind(GAMMA, weight_tensor(0)); // todo, this may have some problem b.bind(BETA, weight_tensor(1)); // how to get gmmam and beta b.bind_arg(PROFILING, profiling_settings()); - 
b.bind_arg(PER_DEVICE_STATE, per_device_state()); + b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); return {LAYERNORM_FWD_TASK_ID, b}; } @@ -70,11 +71,11 @@ OpTaskInvocation backward(LayerNormAttrs const &attrs) { return {LAYERNORM_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto gamma = acc.get_tensor(GAMMA); - auto beta = acc.get_tensor(BETA); +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto gamma = acc.get_tensor(GAMMA); + auto beta = acc.get_tensor(BETA); ProfilingSettings profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); @@ -83,28 +84,21 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[LayerNorm] forward time = %.2lfms\n", state, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - beta.get_float_ptr()); + input, + output, + gamma, + beta); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto gamma = acc.get_tensor(GAMMA); -static optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto gamma = acc.get_tensor(GAMMA); - - auto input_grad = acc.get_tensor(INPUT_GRAD); - auto gamma_grad = acc.get_tensor(GAMMA_GRAD); - auto beta_grad = acc.get_tensor(BETA_GRAD); - auto output_grad = acc.get_tensor(OUTPUT_GRAD); + auto input_grad = acc.get_tensor(INPUT_GRAD); + auto gamma_grad = acc.get_tensor(GAMMA_GRAD); + auto beta_grad = acc.get_tensor(BETA_GRAD); + auto output_grad = acc.get_tensor(OUTPUT_GRAD); ProfilingSettings 
profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); @@ -113,28 +107,20 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[LayerNorm] backward time = %.2lfms\n", state, - output_grad.get_float_ptr(), - input.get_float_ptr(), - input_grad.get_float_ptr(), - gamma.get_float_ptr(), - gamma_grad.get_float_ptr(), - beta_grad.get_float_ptr()); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + output_grad, + input, + input_grad, + gamma, + gamma_grad, + beta_grad); } static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); + auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); - auto input = acc.get_tensor(INPUT); - FFHandler handle = acc.get_argument(HANDLE); + auto input = acc.get_tensor(INPUT); + auto handle = acc.get_argument(HANDLE); // question: how to get batch_size and effective_num_elements int64_t effective_batch_size, effective_num_elements; @@ -143,48 +129,39 @@ static DeviceSpecific M *= input.shape.at(legion_dim_t(attrs.axes[i])); } int num_replicas = 1; - for (int i = 0; i < intput.shape.num_dims(); i++) { + for (int i = 0; i < input.shape.num_dims(); i++) { num_replicas *= input.shape.at(legion_dim_t(i)); effective_num_elements = M; effective_batch_size = input.shape.get_volume() / M; DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, - attrs.eps)); + init_kernel(handle, + allocator, + attrs.elementwise_affine, + effective_batch_size, + effective_num_elements, + attrs.eps); } } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor 
acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, LayerNormAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; init_binding.bind_arg(HANDLE, ff_handle()); init_binding.bind_arg(ATTRS, attrs); - init.binding.bind(INPUT, input.shape); + init_binding.bind(INPUT, input.shape); auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding); - DeviceSpecific = init_task_impl(init_accessor); + DeviceSpecific per_device_state = + init_task_impl(init_accessor); SimTaskBinding fwd_binding; fwd_binding.bind(INPUT, input.shape); @@ -193,8 +170,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); // TODO how to handle gamma and beta, where are they from - fwd_binding.bind(GAMMA, input_shape); - fwd_binding.bind(BETA, input_shape); + fwd_binding.bind(GAMMA, input.shape); + fwd_binding.bind(BETA, input.shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding); @@ -209,7 +186,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); @@ -222,7 +200,7 @@ OpTaskSignature fwd_signature() { } template <> -OpTaskSignature bwd_signature() { +OpTaskSignature bwd_signature() { OpTaskSignature bwd = infer_bwd_signature(fwd_signature()); return bwd; @@ -230,7 +208,8 @@ OpTaskSignature bwd_signature() { template <> OpTaskSignature init_signature() { - OpTaskSignature 
init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -245,7 +224,7 @@ void register_task() { register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signature(), - init_task); + init_task_impl); } template <> @@ -253,15 +232,15 @@ void register_task() { register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd_signature(), - forward_task); + forward_task_impl); } template <> void register_task() { register_task(LAYERNORM_BWD_TASK_ID, "LayerNorm backward", - bwd_signature(), - backward_task); + bwd_signature(), + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/layer_norm.h b/lib/local-execution/src/ops/layer_norm.h similarity index 100% rename from lib/runtime/src/ops/layer_norm.h rename to lib/local-execution/src/ops/layer_norm.h diff --git a/lib/runtime/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc similarity index 55% rename from lib/runtime/src/ops/linear.cc rename to lib/local-execution/src/ops/linear.cc index 96d037913c..2d13909c09 100644 --- a/lib/runtime/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -1,32 +1,14 @@ #include "linear.h" #include "kernels/linear_kernels.h" -#include "layer.h" -#include "legion/legion_utilities.h" #include "op-attrs/ff_dim.h" #include "op-attrs/get_output_shapes.h" -#include "utils/exceptions.h" +#include "task_argument_accessor.h" +#include "utils/exception.h" #include "utils/graph/views.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using 
Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Linear; enum slots { @@ -43,12 +25,12 @@ enum slots { OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(HANDLE, ff_handle()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(ATTRS, attrs); - bind.bind(INPUT, input_tensor(0)); // input - bind.bind(WEIGHT, weight_tensor(0)); // weight - bind.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); // input + binding.bind(WEIGHT, weight_tensor(0)); // weight + binding.bind(OUTPUT, output_tensor(0)); // output return {LINEAR_INIT_TASK_ID, binding}; } @@ -56,14 +38,17 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - bind.bind(INPUT, input_tensor(0)); // input - bind.bind(WEIGHT, weight_tensor(0)); // weight - bind.bind(OUTPUT, output_tensor(0)); // output - bind.bind(BIAS, bias_tensor(0)); // bias + binding.bind(INPUT, input_tensor(0)); // input + binding.bind(WEIGHT, weight_tensor(0)); // weight + binding.bind(OUTPUT, output_tensor(0)); // output + if (attrs.use_bias) { + binding.bind(BIAS, weight_tensor(1)); // bias + } - bing.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(PER_DEVICE_STATE, per_device_state()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + binding.bind_arg(ATTRS, attrs); return {LINEAR_FWD_TASK_ID, binding}; } @@ -74,51 +59,38 @@ OpTaskInvocation backward(LinearAttrs const &attrs) { return {LINEAR_BWD_TASK_ID, b}; } -static DeviceSpecific - init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - Allocator allocator = acc.get_allocator(); +static LinearPerDeviceState init_task_impl(TaskArgumentAccessor const &acc) { + auto const &attrs = acc.get_argument(ATTRS); PerDeviceFFHandle handle = 
acc.get_argument(HANDLE); auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); int out_dim = output.shape.at(ff_dim_t{0}); - int batch_size = output.shape.at.(ff_dim_t{1}); + int batch_size = output.shape.at(ff_dim_t{1}); float *one_ptr; - DeviceSpecific state = - acc.create_device_specific( - init_kernel(handle, - allocator, - one_ptr, - attrs.regularizer, - attrs.use_bias, - input.data_type, - weight.data_type, - output.data_type, - batch_size, - attrs.out_channels)); + LinearPerDeviceState state = init_kernel(handle, + one_ptr, + attrs.regularizer, + attrs.use_bias, + input.data_type, + weight.data_type, + output.data_type, + batch_size, + attrs.out_channels); return state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); auto bias = acc.get_tensor(BIAS); - auto state = acc.get_device_specific(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); @@ -144,15 +116,10 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { batch_size); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -}; +; -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto 
output = acc.get_tensor(OUTPUT); @@ -161,7 +128,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); auto output_grad = acc.get_tensor_grad(OUTPUT); - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); @@ -178,63 +146,61 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Linear] backward_time = %.2lfms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), - output.get_float_ptr(), - output_grad.get_float_ptr(), - weight.get_float_ptr(), - weight_grad.get_float_ptr(), - bias_ptr, + (void *)input.get_float_ptr(), + (void *)input_grad.get_float_ptr(), + (void *)output.get_float_ptr(), + (void *)output_grad.get_float_ptr(), + (void *)weight.get_float_ptr(), + (void *)weight_grad.get_float_ptr(), + (void *)bias_ptr, in_dim, out_dim, batch_size); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, LinearAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shape(input.shape, attrs); + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + ParallelTensorShape weight_shape = get_weights_shape(attrs, input.shape); + ParallelTensorShape bias_shape = get_bias_shape(attrs, input.shape); SimTaskBinding init_binding; - init_binding.bind(INPUT, input_tensor(0)); - init_binding.bind(WEIGHT, weight_tensor(0)); - 
init_binding.bind(BIAS, bias_tensor(0)); - init_binding.bind(OUTPUT, output_tensor(0)); + init_binding.bind(INPUT, input.shape); + init_binding.bind(WEIGHT, weight_shape); + if (attrs.use_bias) { + init_binding.bind(BIAS, bias_shape); + } + init_binding.bind(OUTPUT, output_shape); init_binding.bind_arg(ATTRS, attrs); init_binding.bind_arg(HANDLE, ff_handle()); auto init_accessor = env.get_init_accessor(LINEAR_INIT_TASK_ID, init_binding); - DeviceSpecific per_device_state = - init_task_impl(init_accessor); + LinearPerDeviceState per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; - fwd_bind.bind(INPUT, input_tensor(0)); // input - fwd_bind.bind(WEIGHT, weight_tensor(0)); // weight - fwd_bind.bind(OUTPUT, output_tensor(0)); // output - fwd_bind.bind(BIAS, bias_tensor(0)); // bias + fwd_binding.bind(INPUT, input.shape); // input + fwd_binding.bind(WEIGHT, weight_shape); // weight + fwd_binding.bind(OUTPUT, output_shape); // output + if (attrs.use_bias) { + fwd_binding.bind(BIAS, bias_shape); // bias + } - fwd_bid.bind_arg(PROFILING, profiling_settings()); - fwd_bind.bind_arg(PER_DEVICE_STATE, per_device_state()); - fwd_bind.bind_arg(ATTRS, attrs); + fwd_binding.bind_arg(PROFILING, profiling_settings()); + fwd_binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + fwd_binding.bind_arg(ATTRS, attrs); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_accessor(LINEAR_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = env.get_accessor(LINEAR_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = env.get_fwd_accessor(LINEAR_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(LINEAR_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -245,27 +211,28 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> OpTaskSignature init_signature() { - OpTaskSignature 
init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); - init.add_input_slot(WEIGHT); - init.add_input_slot(BIAS); + init.add_weight_slot(WEIGHT); init.add_output_slot(OUTPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); - return init, + return init; } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); - fwd.add_input_slot(WEIGHT); - fwd.add_input_slot(BIAS); + fwd.add_weight_slot(WEIGHT); + fwd.add_optional_weight_slot(BIAS); fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); @@ -281,13 +248,28 @@ OpTaskSignature bwd_signature() { return bwd; } +template <> +TaskImplFunction get_task_impl() { + return init_task_impl; +} + +template <> +TaskImplFunction get_task_impl() { + return forward_task_impl; +} + +template <> +TaskImplFunction get_task_impl() { + return backward_task_impl; +} + template <> void register_task() { register_task(LINEAR_INIT_TASK_ID, "Linear::init_task", init_signature(), - init_task); + init_task_impl); } template <> @@ -295,7 +277,7 @@ void register_task() { register_task(LINEAR_FWD_TASK_ID, "Linear::fwd_task", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -303,7 +285,11 @@ void register_task() { register_task(LINEAR_BWD_TASK_ID, "Linear::bwd_task", bwd_signature(), - backward_task); + backward_task_impl); +} + +std::vector get_task_ids(LinearAttrs const &) { + return {LINEAR_INIT_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID}; } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/linear.h b/lib/local-execution/src/ops/linear.h similarity index 100% rename from lib/runtime/src/ops/linear.h rename to lib/local-execution/src/ops/linear.h diff --git a/lib/runtime/src/ops/noop.cc b/lib/local-execution/src/ops/noop.cc similarity index 96% rename from lib/runtime/src/ops/noop.cc rename to 
lib/local-execution/src/ops/noop.cc index 6b8510607a..02ffeaf111 100644 --- a/lib/runtime/src/ops/noop.cc +++ b/lib/local-execution/src/ops/noop.cc @@ -14,7 +14,7 @@ */ #include "noop.h" -#include "task_spec/op_task_invocation.h" +#include "op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/noop.h b/lib/local-execution/src/ops/noop.h similarity index 89% rename from lib/runtime/src/ops/noop.h rename to lib/local-execution/src/ops/noop.h index f5cf6cc98c..17a9426e77 100644 --- a/lib/runtime/src/ops/noop.h +++ b/lib/local-execution/src/ops/noop.h @@ -3,7 +3,7 @@ #include "op-attrs/ops/input.h" #include "op-attrs/ops/noop.h" -#include "task_spec/op_task_invocation.h" +#include "op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/parallel_op.h b/lib/local-execution/src/ops/parallel_op.h similarity index 96% rename from lib/runtime/src/ops/parallel_op.h rename to lib/local-execution/src/ops/parallel_op.h index 6b596a4fb5..e7bd98b8a8 100644 --- a/lib/runtime/src/ops/parallel_op.h +++ b/lib/local-execution/src/ops/parallel_op.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ParallelOpJoinResult { - optional op = nullopt; + std::optional op = std::nullopt; bool join_did_succeed = false; }; diff --git a/lib/runtime/src/ops/partition.cc b/lib/local-execution/src/ops/partition.cc similarity index 59% rename from lib/runtime/src/ops/partition.cc rename to lib/local-execution/src/ops/partition.cc index 2a974e96da..c6e5bce64d 100644 --- a/lib/runtime/src/ops/partition.cc +++ b/lib/local-execution/src/ops/partition.cc @@ -13,32 +13,13 @@ * limitations under the License. 
*/ -#include "parallel_ops/partition.h" #include "kernels/partition_kernels.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" +#include "repartition.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Repartition; @@ -59,7 +40,7 @@ OpTaskInvocation forward(RepartitionAttrs const &attrs) { binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(ATTRS, attrs); binding.bind_arg(PER_DEVICE_STATE, - per_device_state()); + per_device_op_state()); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -79,64 +60,39 @@ static DeviceSpecific // Note: use the input data type DeviceSpecific per_device_state = - acc.create_device_specific_state( - init_kernel(handle, input.data_type)); + init_kernel(handle, input.data_type); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - return profiling(forward, 
- profiling, - "[Reparition/Partition] forward_time = %.2lfms\n", - per_device_state, - input, - output); -} - -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); + return profile(forward_kernel, + profiling, + "[Reparition/Partition] forward_time = %.2lfms\n", + per_device_state, + input, + output); } -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward, - profiling, - "[Reparition/Partition] backward_time = %.2lfms\n", - per_device_state, - input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + return profile(backward_kernel, + profiling, + "[Reparition/Partition] backward_time = %.2lfms\n", + per_device_state, + output_grad, + input_grad); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -144,7 +100,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); @@ -165,8 +121,10 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_accessor(REPARTITION_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = 
env.get_accessor(REPARTITION_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = + env.get_fwd_accessor(REPARTITION_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = + env.get_bwd_accessor(REPARTITION_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -177,7 +135,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_unchecked_arg_slot(HANDLE); @@ -185,27 +144,33 @@ void register_task() { init.add_return_value(); - register_task(REPARTITION_INIT_TASK_ID, "Repartition Init", init, init_task); + register_task( + REPARTITION_INIT_TASK_ID, "Repartition Init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - register_task(REPARTITION_FWD_TASK_ID, "Repartition Fwd", fwd, forward_task); + register_task( + REPARTITION_FWD_TASK_ID, "Repartition Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REPARTITION_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REPARTITION_BWD_TASK_ID, "Repartition Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REPARTITION_FWD_TASK_ID)); + +// register_task(REPARTITION_BWD_TASK_ID, "Repartition Bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc similarity index 58% rename from lib/runtime/src/ops/pool_2d.cc rename to lib/local-execution/src/ops/pool_2d.cc index 
577837c960..32bc5d1616 100644 --- a/lib/runtime/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -1,10 +1,10 @@ #include "pool_2d.h" #include "kernels/pool_2d_kernels.h" -#include "legion/legion_utilities.h" + #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/pool_2d.h" #include "utils/exception.decl.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Pool2D; @@ -23,13 +23,13 @@ OpTaskInvocation init(Pool2DAttrs const &attrs) { return {POOL2D_INIT_TASK_ID, binding}; } -static DeviceSpecific +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); int input_w = input.shape.at(ff_dim_t(0)) + 1; int input_h = input.shape.at(ff_dim_t(1)) + 1; @@ -64,37 +64,27 @@ static DeviceSpecific printf("Warning: changing pool_padding_w to satisfy output_w size\n"); } - DeviceSpecific state = acc.create_device_specific( - init_kernel(handle, - attrs.activation, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - attrs.kernel_h, - attrs.kernel_w, - attrs.stride_h, - attrs.stride_w, - attrs.pool_type); + DeviceSpecific state = init_kernel(handle, + attrs.activation, + input_w, + input_h, + input_c, + input_n, + output_w, + output_h, + output_c, + output_n, + pad_h, + pad_w, + attrs.kernel_h, + attrs.kernel_w, + attrs.stride_h, + attrs.stride_w, + attrs.pool_type); return state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - OpTaskInvocation forward(Pool2DAttrs const &attrs) { OpTaskBinding 
binding; binding.bind(INPUT, input_tensor(0)); @@ -102,53 +92,46 @@ OpTaskInvocation forward(Pool2DAttrs const &attrs) { binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state()); return {POOL2D_FWD_TASK_ID, binding}; } -OpTaskInvocation backward(Pool2DAttrs const &) { +OpTaskInvocation backward(Pool2DAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); return {POOL2D_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - Pool2dPerDeviceState state = - acc.get_argument(PER_DEVICE_STATE); + Pool2DPerDeviceState state = + acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, - profilng, + profiling, "[Pool2D] forward_time = %.2lfms\n", state, input.get_float_ptr(), output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - Pool2dPerDeviceState state = - acc.get_argument(PER_DEVICE_STATE); + Pool2DPerDeviceState state = + acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto output_grad = 
acc.get_tensor(OUTPUT); return profile(backward_kernel, - profilng, + profiling, "[Pool2D] backward_time = %.2lfms\n", state, input.get_float_ptr(), @@ -157,20 +140,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, Pool2DAttrs const &attrs, - ParallelTensorShape const &input, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; @@ -181,21 +156,21 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, auto init_accessor = env.get_init_accessor(POOL2D_INIT_TASK_ID, init_binding); - DeviceSpecific per_device_state = + DeviceSpecific per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; - fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); - auto fwd_accessor = env.get_accessor(POOL2D_FWD_TASK_ID, fwd_binding); + auto fwd_accessor = env.get_fwd_accessor(POOL2D_FWD_TASK_ID, fwd_binding); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto bwd_accessor = env.get_accessor(POOL2D_BWD_TASK_ID, bwd_binding); + auto bwd_accessor = env.get_bwd_accessor(POOL2D_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -207,7 +182,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template 
<> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); init.add_output_slot(OUTPUT); @@ -217,28 +193,32 @@ void register_task() { init.add_return_value(); - register_task(POOL2D_INIT_TASK_ID, "Pool2D::init", init, init_taks); + register_task(POOL2D_INIT_TASK_ID, "Pool2D::init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - register_task(POOL2D_FWD_TASK_ID, "Pool2D::forward", fwd, forward_task); + register_task(POOL2D_FWD_TASK_ID, "Pool2D::forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(POOL2D_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(POOL2D_BWD_TASK_ID, "Pool2D::backward", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(POOL2D_FWD_TASK_ID)); + +// register_task(POOL2D_BWD_TASK_ID, "Pool2D::backward", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/pool_2d.h b/lib/local-execution/src/ops/pool_2d.h similarity index 97% rename from lib/runtime/src/ops/pool_2d.h rename to lib/local-execution/src/ops/pool_2d.h index f8701f461e..852110e2e2 100644 --- a/lib/runtime/src/ops/pool_2d.h +++ b/lib/local-execution/src/ops/pool_2d.h @@ -20,7 +20,7 @@ OpTaskInvocation backward(Pool2DAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, Pool2DAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input_shape, ProfilingSettings const &settings, MachineView const &machine_view); diff --git 
a/lib/runtime/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc similarity index 60% rename from lib/runtime/src/ops/reduce.cc rename to lib/local-execution/src/ops/reduce.cc index 2674dc4fef..d502a2b669 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -1,27 +1,12 @@ #include "reduce.h" #include "kernels/reduce_kernels.h" -#include "legion/legion_utilities.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" + +#include "op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" #include "utils/type_traits_core.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reduce; @@ -54,42 +39,34 @@ static DeviceSpecific auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - OperatorType = attrs.op_type; + OperatorType op_type = attrs.op_type; // Note: How to set the reduction size? 
size_t reduction_size = input.shape.get_volume() / output.shape.get_volume(); DeviceSpecific per_device_state = - acc.create_device_specific(init_kernel( - handle, op_type, reduction_size, input.shape, output.shape)); + init_kernel(handle, op_type, reduction_size, input.shape, output.shape); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT) + OpTaskSignature init; + init.type = OpTaskType::INIT; - init.add_unchecked_arg_slot(HANDLE); + init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); init.add_return_value(); - register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); + register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task_impl); } // Note: forward_kernel only needs ReducePerDeviceState, input, output OpTaskInvocation forward(ReduceAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - bind.bind_arg(PROFILING, profiling_tensor()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + binding.bind_arg(PROFILING, profiling_settings()); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -97,7 +74,7 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) { return {REDUCE_FWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -113,25 +90,18 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - 
TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FORWARD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task); + register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task_impl); } OpTaskInvocation backward(ReduceAttrs const &attrs) { @@ -140,48 +110,44 @@ OpTaskInvocation backward(ReduceAttrs const &attrs) { return {REDUCE_BWD_TASK_ID, binding}; } -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, "[Reduce] backward_time = %.2lfms\n", per_device_state, - input.get_float_ptr(), - output.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} +// TODO: OpTaskSignature -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); - reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task); -} +// register_task(REDUCE_BWD_TASK_ID, 
"Reduce::backward", bwd, +// backward_task_impl); +// } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReduceAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); + init_binding.bind_arg(HANDLE, ff_handle()); auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding); DeviceSpecific per_device_state = @@ -189,10 +155,10 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding fwd_binding; ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - fwd.bind(INPUT, input.shape); - fwd.bind(OUTPUT, output_shape); - fwd.bind_arg(PROFILING, settings); - fwd.bind_arg(PER_DEVICE_STATE, per_device_state); + fwd_binding.bind(INPUT, input.shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); diff --git a/lib/runtime/src/ops/reduce.h b/lib/local-execution/src/ops/reduce.h similarity index 98% rename from lib/runtime/src/ops/reduce.h rename to lib/local-execution/src/ops/reduce.h index 099083ed67..4c22a9127e 100644 --- a/lib/runtime/src/ops/reduce.h +++ b/lib/local-execution/src/ops/reduce.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #include "op-attrs/ops/reduce.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc similarity index 58% rename from lib/runtime/src/ops/reduction.cc rename to lib/local-execution/src/ops/reduction.cc index 9a11d3a6f5..31b3e2458d 100644 --- 
a/lib/runtime/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -13,32 +13,14 @@ * limitations under the License. */ -#include "parallel_ops/reduction.h" +#include "reduction.h" #include "kernels/reduction_kernels.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reduction; @@ -61,7 +43,7 @@ OpTaskInvocation backward(ReductionAttrs const &attrs) { return {REDUCTION_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling_settings = acc.get_argument(PROFILING); @@ -71,40 +53,25 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { size_t num_replicas = attrs.reduction_degree; - return profiling(forward_kernel, - profiling_settings, - "[Reduction] forward_time = %.2lfms\n", - input, - output, - num_replicas); + return profile(forward_kernel, + profiling_settings, + "[Reduction] forward_time = %.2lfms\n", + input, + output, + num_replicas); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static 
std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward_kernel, - profiling, - "[Reduction] backward_time = %.2lfms\n", - input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + return profile(backward_kernel, + profiling, + "[Reduction] backward_time = %.2lfms\n", + input_grad, + output_grad); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -114,13 +81,13 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, MachineView const &machine_view) { ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(ATTRS, attrs); fwd_binding.bind(INPUT, input.shape); - fwd.binding.bind(OUTPUT, output_shape); + fwd_binding.bind(OUTPUT, output_shape); auto fwd_accessor = env.get_fwd_accessor(REDUCTION_FWD_TASK_ID, fwd_binding); @@ -137,7 +104,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_arg_slot(ATTRS); @@ -145,15 +113,18 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REDUCTION_FWD_TASK_ID, "Reduction Fwd", fwd, forward_task); + register_task(REDUCTION_FWD_TASK_ID, "Reduction Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - 
OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REDUCTION_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REDUCTION_BWD_TASK_ID, "Reduction Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REDUCTION_FWD_TASK_ID)); + +// register_task(REDUCTION_BWD_TASK_ID, "Reduction Bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reduction.h b/lib/local-execution/src/ops/reduction.h similarity index 96% rename from lib/runtime/src/ops/reduction.h rename to lib/local-execution/src/ops/reduction.h index 978ca6b080..071c4d2a7b 100644 --- a/lib/runtime/src/ops/reduction.h +++ b/lib/local-execution/src/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "op-attrs/ops/combine.h" +#include "op-attrs/ops/reduction.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/repartition.h b/lib/local-execution/src/ops/repartition.h similarity index 98% rename from lib/runtime/src/ops/repartition.h rename to lib/local-execution/src/ops/repartition.h index fccc0de7be..0c8cdaf0f9 100644 --- a/lib/runtime/src/ops/repartition.h +++ b/lib/local-execution/src/ops/repartition.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_PARTITION_H #include "op-attrs/ops/repartition.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc similarity index 65% rename from lib/runtime/src/ops/replicate.cc rename to lib/local-execution/src/ops/replicate.cc index 1675a62c5f..fa13766d9e 100644 --- a/lib/runtime/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -13,39 +13,21 @@ * limitations under the License. 
*/ -#include "parallel_ops/replicate.h" +#include "replicate.h" #include "kernels/replicate_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/graph/serialparallel.h" #include "utils/hash-utils.h" #include namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Replicate; -enum Slots { INPUT, OUTPUT, PROFILING }; +enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; OpTaskInvocation forward(ReplicateAttrs const &attrs) { OpTaskBinding binding; @@ -54,6 +36,7 @@ OpTaskInvocation forward(ReplicateAttrs const &attrs) { binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); return {REPLICATE_FWD_TASK_ID, binding}; } @@ -63,7 +46,7 @@ OpTaskInvocation backward(ReplicateAttrs const &attrs) { return {REPLICATE_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -76,33 +59,20 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional 
backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto const &attrs = acc.get_argument(ATTRS); return profile(backward_kernel, profiling, "[replicate] backward_time = %.2lfms\n", input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + output_grad, + attrs.replicate_degree); // is this `num_replicas`? } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -110,7 +80,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PROFILING, settings); ParallelTensorShape output = get_output_shape(attrs, input.shape); @@ -130,20 +100,25 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REPLICATE_FWD_TASK_ID, "Replicate fwd", fwd, forward_task); + register_task(REPLICATE_FWD_TASK_ID, "Replicate fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(CAST_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REPLICATE_BWD_TASK_ID, "Replicate bwd", bwd, backward_task); -} +// template <> 
+// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(CAST_FWD_TASK_ID)); + +// register_task(REPLICATE_BWD_TASK_ID, "Replicate bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/replicate.h b/lib/local-execution/src/ops/replicate.h similarity index 95% rename from lib/runtime/src/ops/replicate.h rename to lib/local-execution/src/ops/replicate.h index da2b71f098..510676931b 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/local-execution/src/ops/replicate.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_REPLICATE_H #include "op-attrs/ops/replicate.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc similarity index 68% rename from lib/runtime/src/ops/reshape.cc rename to lib/local-execution/src/ops/reshape.cc index c9dc8cff8d..2b3200d79d 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -15,24 +15,10 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reshape; @@ -69,24 +55,14 @@ static DeviceSpecific auto attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(attrs.shape.data_type)); + init_kernel(attrs.shape.data_type); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - 
std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - Profiling profiling = acc.get_argument(PROFILING); + ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -99,18 +75,11 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - Profiling profiling = acc.get_argument(PROFILING); + ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); @@ -123,20 +92,13 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReshapeAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); auto init_accessor = @@ -164,18 +126,20 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template 
<> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(ATTRS); - init.add_return_value(PER_DEVICE_STATE); + init.add_return_value(); - register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); + register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); @@ -183,15 +147,17 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task); + register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); + +// register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reshape.h b/lib/local-execution/src/ops/reshape.h similarity index 98% rename from lib/runtime/src/ops/reshape.h rename to lib/local-execution/src/ops/reshape.h index f044e3f057..0b845de5fc 100644 --- a/lib/runtime/src/ops/reshape.h +++ b/lib/local-execution/src/ops/reshape.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RESHAPE_H #include "op-attrs/ops/reshape.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc similarity index 69% rename from lib/runtime/src/ops/reverse.cc rename to 
lib/local-execution/src/ops/reverse.cc index ac64146cd1..6c28966e6e 100644 --- a/lib/runtime/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -19,23 +19,9 @@ #include "op-attrs/get_output_shapes.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reverse; +using coord_t = long long; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -43,7 +29,7 @@ OpTaskInvocation forward(ReverseAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -56,22 +42,22 @@ OpTaskInvocation backward(ReverseAttrs const &attrs) { return {REVERSE_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int output_size = outtput.shape.get_volume(); + int output_size = output.shape.get_volume(); auto axis = attrs.axis; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < output.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= output.shape[i]; + in_blk_size *= output.shape.at(ff_dim_t(i)); } else if (i == axis) { - reverse_dim_size = output.shape[i]; + reverse_dim_size = output.shape.at(ff_dim_t(i)); } else { - num_out_blks *= output.shape[i]; + num_out_blks *= 
output.shape.at(ff_dim_t(i)); } } @@ -86,29 +72,22 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output_size); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input.shape.get_dim() - attrs.axis - 1; + int axis = input_grad.shape.get_dim() - attrs.axis.value() - 1; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < input_grad.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= input_grad.shape[i]; + in_blk_size *= input_grad.shape.at(ff_dim_t(i)); } else if (i == axis) { - reverse_dim_size = input_grad.shape[i]; + reverse_dim_size = input_grad.shape.at(ff_dim_t(i)); } else { - num_out_blks *= input_grad.shape[i]; + num_out_blks *= input_grad.shape.at(ff_dim_t(i)); } } @@ -120,15 +99,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { num_out_blks, reverse_dim_size, in_blk_size, - input.shape.get_volume()); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + input_grad.shape.get_volume()); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -136,7 +107,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); 
SimTaskBinding fwd_binding; @@ -161,21 +132,24 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, } template <> -void register_task()) { - OpTaskSignature fwd(OpTaskType::FWD); +void register_task() { + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REVERSE_FWD_TASK_ID, "Reverse forward", fwd, forward_task); + register_task(REVERSE_FWD_TASK_ID, "Reverse forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REVERSE_BWD_TASK_ID)); - register_task(REVERSE_BWD_TASK_ID, "Reverse backward", bwd, backward_task); -} +// TODO: OpTaskSignature +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REVERSE_BWD_TASK_ID)); +// register_task(REVERSE_BWD_TASK_ID, "Reverse backward", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reverse.h b/lib/local-execution/src/ops/reverse.h similarity index 95% rename from lib/runtime/src/ops/reverse.h rename to lib/local-execution/src/ops/reverse.h index af4d335429..68545644bd 100644 --- a/lib/runtime/src/ops/reverse.h +++ b/lib/local-execution/src/ops/reverse.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_REVERSE_H_ #include "op-attrs/ops/reverse.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc similarity index 67% rename from lib/runtime/src/ops/softmax.cc rename to lib/local-execution/src/ops/softmax.cc index b67f9730a4..054b3bc7db 100644 --- a/lib/runtime/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -17,26 +17,10 @@ #include "kernels/softmax_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" -#include 
"utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Softmax; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE }; @@ -75,21 +59,11 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, attrs.dim)); + init_kernel(handle, attrs.dim.value()); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -101,18 +75,11 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { "[SoftMax] forward_time = %.2lfms\n", per_device_state, input.get_float_ptr(), - output.get_float_ptr(), ); + output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto 
input_grad = acc.get_tensor_grad(INPUT); @@ -124,22 +91,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { assert(output_grad.shape == output.shape); - return profile( - backward_kernel, - profiling, - "[SoftMax] backward_time = %.2lfms\n", - input_grad.get_float_ptr(), - output_grad.get_float_ptr(), - output_grad.shape.volume(), // Note(lambda): get num_elements, maybe wrong - ); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + return profile(backward_kernel, + profiling, + "[SoftMax] backward_time = %.2lfms\n", + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.shape.get_volume()); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -147,7 +104,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); @@ -162,7 +119,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, init_task_impl(init_accessor); SimTaskBinding fwd_binding; - ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); fwd_binding.bind_arg(PROFILING, settings); @@ -182,18 +138,20 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); - init.add_return_value_slot(); + init.add_return_value(); - register_task(SOFTMAX_INIT_TASK_ID, "SoftMax Init", init, init_task); + 
register_task(SOFTMAX_INIT_TASK_ID, "SoftMax Init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); @@ -201,15 +159,17 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(SOFTMAX_FWD_TASK_ID, "SoftMax Fwd", fwd, forward_task); + register_task(SOFTMAX_FWD_TASK_ID, "SoftMax Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(SOFTMAX_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(SOFTMAX_BWD_TASK_ID, "SoftMax Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(SOFTMAX_FWD_TASK_ID)); + +// register_task(SOFTMAX_BWD_TASK_ID, "SoftMax Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/softmax.h b/lib/local-execution/src/ops/softmax.h similarity index 98% rename from lib/runtime/src/ops/softmax.h rename to lib/local-execution/src/ops/softmax.h index 06b9d09d60..8fe2f96eb5 100644 --- a/lib/runtime/src/ops/softmax.h +++ b/lib/local-execution/src/ops/softmax.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_SOFTMAX_H #include "op-attrs/ops/softmax.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/split.cc b/lib/local-execution/src/ops/split.cc similarity index 67% rename from lib/runtime/src/ops/split.cc rename to lib/local-execution/src/ops/split.cc index 2af5d42874..3661d6e074 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -16,28 +16,14 @@ #include "split.h" #include "kernels/array_shape.h" #include "kernels/split_kernels.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" 
+#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; using namespace FlexFlow::Kernels::Split; +using coord_t = long long; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -58,24 +44,40 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { return {SPLIT_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +void calc_block_size(coord_t &num_blks, + coord_t &blk_size, + ArrayShape const &array_shape, + int axis) { + num_blks = 1; + blk_size = 1; + for (int d = 0; d < array_shape.num_elements(); d++) { + if (d <= axis) { + blk_size *= array_shape.at(legion_dim_t(d)); + } else { + num_blks *= array_shape.at(legion_dim_t(d)); + } + } +} + +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis); + calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { coord_t out_num_blks; calc_block_size( - out_num_blks, out_blk_size[i], output.shape, split->legion_axis); + out_num_blks, out_blk_size[i], output.shape, attrs.axis.value()); } + float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, profiling, "Split forward_time = %.2lfms\n", - &output.get_float_ptr(), + &output_float_ptr, 
input.get_float_ptr(), out_blk_size, in_blk_size, @@ -83,71 +85,42 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.splits.size()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - // maybe we should add assert like the original code -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input_grade.shape, attrs.axis); + calc_block_size(num_blks, in_blk_size, input_grad.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { coord_t out_num_blks; calc_block_size( - out_num_blks, out_blk_size[i], output_grad.shape, split->legion_axis); + out_num_blks, out_blk_size[i], output_grad.shape, attrs.axis.value()); } + float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, profiling, "Split backward_time = %.2lfms\n", input_grad.get_float_ptr(), - &output_grad.get_float_ptr(), + &output_grad_ptr, out_blk_size, in_blk_size, num_blks, attrs.splits.size()); } -void calc_block_size(coord_t &num_blks, - coord_t &blk_size, - ArrayShape const &array_shape, - int axis) { - num_blks = 1; - blk_size = 1; - for (int d = 0; d < array_shape.get_dim(); d++) { - if (d <= axis) { - blk_size *= (domain.hi()[d] - domain.lo()[d] + 1); - blk_size *= array_shape.at(legion_dim_t(d)) + 1 - } else { - num_blks *= array_shape.at(legion_dim_t(d)) + 1 - } - } -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - 
TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + std::vector output_shape = + get_output_shapes(attrs, input.shape); SimTaskBinding fwd_binding; fwd_binding.bind(INPUT, input.shape); @@ -166,23 +139,26 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, return make_metrics(forward_time, backward_time, sync_time, env); } +// TODO: OpTaskSignature + template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task); + register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); - register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task); -} +// register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/split.h b/lib/local-execution/src/ops/split.h similarity index 95% rename from lib/runtime/src/ops/split.h rename to lib/local-execution/src/ops/split.h index d63212e836..1fdfdc2432 100644 --- a/lib/runtime/src/ops/split.h +++ b/lib/local-execution/src/ops/split.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_SPLIT_H #include "op-attrs/ops/split.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include 
"task_spec/op_task_invocation.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(SplitAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, - InputParallelTensorDes const &input, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc similarity index 59% rename from lib/runtime/src/ops/topk.cc rename to lib/local-execution/src/ops/topk.cc index 958516a6d9..5fb2c6842f 100644 --- a/lib/runtime/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -16,28 +16,9 @@ #include "topk.h" #include "kernels/topk_kernels.h" #include "op-attrs/get_output_shapes.h" -#include "utils/exceptions.h" +#include "utils/exception.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; using namespace FlexFlow::Kernels::TopK; @@ -50,7 +31,7 @@ enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, PER_DEVICE_STATE }; OpTaskInvocation init(TopKAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); return {TOPK_INIT_TASK_ID, binding}; } @@ -60,7 +41,7 @@ OpTaskInvocation forward(TopKAttrs const &attrs) { binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -81,23 +62,14 @@ static 
DeviceSpecific auto attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific(init_kernel(attrs.sorted)); + init_kernel(attrs.sorted); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto per_device_state = - acc.get_device_specific(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); auto profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -107,31 +79,24 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { size_t batch_size = input.shape.get_volume() / length; auto indices = acc.get_tensor(INDICES); - return profiling(forward_kernel, - profiling, - "[TopK] forward_time = %.2lfms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - indices.get_int32_ptr(), - batch_size, - length, - attrs.k, - attrs.sorted); + return profile(forward_kernel, + profiling, + "[TopK] forward_time = %.2lfms\n", + per_device_state, + input.get_float_ptr(), + output.get_float_ptr(), + indices.get_int32_ptr(), + batch_size, + length, + attrs.k, + attrs.sorted); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto per_device_state = - acc.get_device_specific(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); auto profiling = acc.get_argument(PROFILING); auto input_grad = 
acc.get_tensor_grad(INPUT); @@ -139,27 +104,19 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto indices = acc.get_tensor(INDICES); - int length = input.shape.at(legion_dim_t(0)) + 1; - size_t batch_size = input.shape.get_volume() / length; - - return profiling(backward_kernel, - profiling, - "[TopK] backward_time = %.2lfms\n", - per_device_state, - output_grad.get_float_ptr(), - indices.get_int32_ptr(), - input_grad.get_float_ptr(), - batch_size, - length, - attrs.k); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + int length = input_grad.shape.at(legion_dim_t(0)) + 1; + size_t batch_size = input_grad.shape.get_volume() / length; + + return profile(backward_kernel, + profiling, + "[TopK] backward_time = %.2lfms\n", + per_device_state, + output_grad.get_float_ptr(), + indices.get_int32_ptr(), + input_grad.get_float_ptr(), + batch_size, + length, + attrs.k); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -167,9 +124,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shapes(attrs, input.shape); + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); @@ -200,16 +157,18 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(ATTRS); // Note: this may have some question init.add_return_value(); - register_task(TOPK_INIT_TASK_ID, "Topk Init", init, init_task); + 
register_task(TOPK_INIT_TASK_ID, "Topk Init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_arg_slot(ATTRS); @@ -219,14 +178,17 @@ void register_task() { fwd.add_output_slot(OUTPUT); fwd.add_output_slot(INDICES); - register_task(TOPK_FWD_TASK_ID, "TopK Forward", fwd, forward_task); + register_task(TOPK_FWD_TASK_ID, "TopK Forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(TOPK_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(TOPK_BWD_TASK_ID, "TopK Backward", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(TOPK_FWD_TASK_ID)); + +// register_task(TOPK_BWD_TASK_ID, "TopK Backward", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/topk.h b/lib/local-execution/src/ops/topk.h similarity index 98% rename from lib/runtime/src/ops/topk.h rename to lib/local-execution/src/ops/topk.h index f15ff6de81..fcab2a5a31 100644 --- a/lib/runtime/src/ops/topk.h +++ b/lib/local-execution/src/ops/topk.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_TOPK_H_ #include "op-attrs/ops/topk.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc similarity index 55% rename from lib/runtime/src/ops/transpose.cc rename to lib/local-execution/src/ops/transpose.cc index ea6182772f..f580a46792 100644 --- a/lib/runtime/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -15,27 +15,10 @@ #include "transpose.h" #include "kernels/transpose_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/transpose.h" #include 
"utils/exception.decl.h" -namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Transpose; namespace FlexFlow { @@ -57,33 +40,26 @@ OpTaskInvocation init(TransposeAttrs const &attrs) { static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); - std::vector perm = attrs.perm; // default convert stack_vector to vector + std::vector perm = static_cast>(attrs.perm); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(perm.size(), perm)); + init_kernel(perm.size(), perm); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} +// TODO: OpTaskSignature -template <> -void register_task(); -OpTaskSignature init(OpTaskType::INIT) +// template <> +// void register_task() { +// OpTaskSignature init(OpTaskType::INIT); - init.add_arg_slot(ATTRS); +// init.add_arg_slot(ATTRS); -init.add_return_value(); +// init.add_return_value(); -register_task(TRANSPOSE_INIT_TASK_ID, "Transpose::init", init, init_task); -} // namespace FlexFlow +// register_task(TRANSPOSE_INIT_TASK_ID, "Transpose::init", init, +// init_task_impl); +// } OpTaskInvocation forward(TransposeAttrs const &attrs) { OpTaskBinding binding; @@ -92,13 +68,13 @@ OpTaskInvocation forward(TransposeAttrs const &attrs) { per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - bind.bind(INPUT, input_tensor(0)); - bind.bind(OUTPUT, output_tensor(0)); + 
binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {TRANSPOSE_FWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -106,47 +82,32 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - return profiling(forward_kernel, - profiling, - "[Transpose] Forward_time = %.2lf [ms]", - per_device_state, - input, - output); + return profile(forward_kernel, + profiling, + "[Transpose] Forward_time = %.2lf [ms]", + per_device_state, + input, + output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = - acc.get_per_device_state(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward_kernel, - profiling, - "[Transpose] Backward_time = %.2lf [ms]", - per_device_state, - input_grad, - output_grad); + return profile(backward_kernel, + profiling, + "[Transpose] Backward_time = %.2lf [ms]", + per_device_state, + input_grad, + output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); 
-} - -OpTaskInvocation backward(TransposeAttrs const &) { +OpTaskInvocation backward(TransposeAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); return {TRANSPOSE_BWD_TASK_ID, binding}; @@ -159,7 +120,7 @@ CostMetrics &input_descs, // Note:this may have some problem ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); @@ -169,12 +130,13 @@ CostMetrics DeviceSpecific per_device_state = init_task_impl(init_accessor); - ParallelTensorShape output_shape = get_output_shape(attrs, input_descs.shape); + ParallelTensorShape output_shape = + get_output_shape(attrs, input_descs.shapes); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); fwd_binding.bind_arg(PROFILING, settings); - fwd_binding.bind(INPUT, input_descs.shape); + fwd_binding.bind(INPUT, input_descs.shapes); fwd_binding.bind(OUTPUT, output_shape); auto fwd_accessor = env.get_fwd_accessor(TRANSPOSE_FWD_TASK_ID, fwd_binding); @@ -189,4 +151,4 @@ CostMetrics return make_metrics(forward_time, backward_time, sync_time, env); } -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/lib/runtime/src/ops/transpose.h b/lib/local-execution/src/ops/transpose.h similarity index 98% rename from lib/runtime/src/ops/transpose.h rename to lib/local-execution/src/ops/transpose.h index 52e824ebbf..6c6dffdc8a 100644 --- a/lib/runtime/src/ops/transpose.h +++ b/lib/local-execution/src/ops/transpose.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_TRANSPOSE_H_ #include "op-attrs/ops/transpose.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { From 931b47c4bf710c85d3b9fb6cc09e8a3e6a7d1306 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 14 May 2024 07:00:01 -0700 Subject: [PATCH 04/24] Format --- 
lib/local-execution/include/tracked_allocator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/tracked_allocator.h index 64cc31e858..766411357a 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/tracked_allocator.h @@ -5,7 +5,7 @@ namespace FlexFlow { -struct TrackedAllocator: public Allocator { +struct TrackedAllocator : public Allocator { Allocator() = delete; void *allocate(size_t mem_size); From 8a66ed93a52ecfc0212ebd8ee71fda105a1a64af Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 14 May 2024 07:01:33 -0700 Subject: [PATCH 05/24] Format --- lib/local-execution/include/tracked_allocator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/tracked_allocator.h index 64cc31e858..766411357a 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/tracked_allocator.h @@ -5,7 +5,7 @@ namespace FlexFlow { -struct TrackedAllocator: public Allocator { +struct TrackedAllocator : public Allocator { Allocator() = delete; void *allocate(size_t mem_size); From 3ffe2392271220f0b22b11e820ca01cf6c094b33 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 21 May 2024 15:58:07 -0700 Subject: [PATCH 06/24] Fix tracked allocator --- .../include/tracked_allocator.h | 14 ++++++++++---- lib/local-execution/src/tracked_allocator.cc | 19 ++++++++++--------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/tracked_allocator.h index 766411357a..510716b3af 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/tracked_allocator.h @@ -5,16 +5,22 @@ namespace FlexFlow { -struct TrackedAllocator : public Allocator { - Allocator() = delete; +struct TrackedAllocator : public 
IAllocator { + TrackedAllocator() = default; + TrackedAllocator(TrackedAllocator const &) = delete; + TrackedAllocator(TrackedAllocator &&) = delete; + ~TrackedAllocator() = default; - void *allocate(size_t mem_size); - void deallocate(void *ptr); + void *allocate(size_t) override; + void deallocate(void *) override; size_t get_current_mem_usage(); private: size_t current_mem_usage; }; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(TrackedAllocator); + +Allocator get_tracked_memory_allocator(); } // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 6e666b647c..a3ccdc02b5 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -3,25 +3,26 @@ namespace FlexFlow { -void *TrackedAllocator::allocate(size_t mem_size) { - void *ptr = this->i_allocator->allocate(mem_size); - this->curr_mem_usage += mem_size; +void *TrackedAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + this->current_mem_usage += requested_memory_size; return ptr; } void TrackedAllocator::deallocate(void *ptr) { size_t psize; - checkCUDA(cuMemGetAddressRange(nullptr, &psize, ptr)); - this->i_allocator->deallocate(ptr); - this->curr_mem_usage -= psize; + checkCUDA(cudaGetSymbolSize(&psize, ptr)); + checkCUDA(cudaFree(ptr)); + this->current_mem_usage -= psize; } size_t TrackedAllocator::get_current_mem_usage() { - return this->curr_mem_usage; + return this->current_mem_usage; } -TrackedAllocator get_tracked_local_allocator() { - return Allocator::create(); +Allocator get_tracked_memory_allocator() { + return Allocator::create(); } } // namespace FlexFlow From da10906c6edd8051cdc0455fca3f62e82863affb Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 21 May 2024 17:08:58 -0700 Subject: [PATCH 07/24] Fix comp graph --- lib/op-attrs/include/op-attrs/ops/linear.h | 4 + lib/pcg/include/pcg/computation_graph.h | 100 
++++-------------- .../include/pcg/computation_graph_builder.h | 9 +- lib/pcg/include/pcg/tensor.h | 2 + lib/pcg/src/computation_graph.cc | 79 ++++++++++++++ lib/pcg/src/computation_graph_builder.cc | 98 ++++++++++------- lib/pcg/src/tensor.cc | 4 + 7 files changed, 173 insertions(+), 123 deletions(-) create mode 100644 lib/pcg/src/computation_graph.cc diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h index 0eb7ccec45..a46df59282 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear.h +++ b/lib/op-attrs/include/op-attrs/ops/linear.h @@ -34,6 +34,10 @@ FF_VISITABLE_STRUCT( LinearAttrs, out_channels, use_bias, data_type, activation, regularizer); CHECK_VALID_OP_ATTR(LinearAttrs); +TensorShape get_weights_shape(LinearAttrs const &attrs, + TensorShape const &input); +TensorShape get_bias_shape(LinearAttrs const &attrs, TensorShape const &input); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 4d4fa86efa..53aa7eb820 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -17,82 +17,6 @@ struct ComputationGraph OutputLabelledMultiDiGraph> { using strong_typedef::strong_typedef; - std::vector traverse() { - std::vector layers = get_topological_ordering(this->value()); - return transform(layers, [&](Node const &e) -> operator_guid_t { - return operator_guid_t{e}; - }); - } - - std::vector traverse_reverse_order() { - std::vector layers = - reversed>(get_topological_ordering(this->value())); - return transform(layers, [&](Node const &e) -> operator_guid_t { - return operator_guid_t{e}; - }); - } - - bool out_edge_comparator(MultiDiOutput x, MultiDiOutput y) { - return x.src_idx < y.src_idx; - } - - std::vector - sort_edge_set(std::unordered_set edges) { - std::unordered_set outputs = - transform(edges, [&](MultiDiEdge const &e) -> MultiDiOutput { - return MultiDiOutput(e); - }); - std::vector 
sorted_outputs(outputs.begin(), outputs.end()); - sort(sorted_outputs.begin(), sorted_outputs.end(), out_edge_comparator); - return transform(sorted_outputs, - [&](MultiDiOutput const &e) -> tensor_guid_t { - return tensor_guid_t{e}; - }); - } - - std::vector get_outgoing_tensors(operator_guid_t n) { - return sort_edge_set(get_outgoing_edges(this->value(), n.value())); - } - - std::vector get_incoming_tensors(operator_guid_t n) { - return sort_edge_set(get_incoming_edges(this->value(), n.value())); - } - - operator_guid_t add_node(Layer const &layer) { - Node added_node = this->value().add_node(layer); - return operator_guid_t{added_node}; - } - - void add_output(tensor_guid_t const &output, Tensor const &tensor) { - this->value().add_output(output.value(), tensor); - } - - tensor_guid_t create_outgoing_edge(operator_guid_t node, int idx) { - MultiDiOutput edge = {node.value(), NodePort{idx}}; - return tensor_guid_t{edge}; - } - - tensor_guid_t create_outgoing_edge_with_label(operator_guid_t node, - int idx, - Tensor tensor) { - tensor_guid_t tensor_guid = create_outgoing_edge(node, idx); - add_output(tensor_guid, tensor); - return tensor_guid; - } - - void add_incoming_edges(std::vector const &incoming_edges, - operator_guid_t node) { - size_t incoming_edge_dst_port = 0; - for (tensor_guid_t input : incoming_edges) { - MultiDiOutput input_view = input.value(); - MultiDiEdge edge = {node.value(), - NodePort{incoming_edge_dst_port++}, - input_view.src, - input_view.src_idx}; - this->value().add_edge(edge); - } - } - Layer &at(operator_guid_t const &n) { return this->value().at(n.value()); } @@ -108,13 +32,29 @@ struct ComputationGraph Tensor const &at(tensor_guid_t const &e) const { return this->value().at(e.value()); } - - CompGraphOperatorAttrs get_layer_attrs(operator_guid_t const &n) const { - return this->at(n).attrs; - } }; CHECK_WELL_BEHAVED_VALUE_TYPE_NO_HASH(ComputationGraph); +std::vector + traverse_comp_graph(ComputationGraph const &comp_graph); 
+std::vector + traverse_comp_graph_backwards(ComputationGraph const &comp_graph); +std::vector + get_outgoing_tensors(ComputationGraph const &comp_graph, operator_guid_t n); +std::vector + get_incoming_tensors(ComputationGraph const &comp_graph, operator_guid_t n); +operator_guid_t add_node(ComputationGraph &comp_graph, Layer const &layer); +tensor_guid_t create_outgoing_edge_with_label(ComputationGraph &comp_graph, + operator_guid_t node, + int idx, + Tensor tensor); + +void add_incoming_edges(ComputationGraph &comp_graph, + std::vector const &incoming_edges, + operator_guid_t node); +CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const &comp_graph, + operator_guid_t const &n); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 1be8d7ad0e..7ba95d701b 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -250,18 +250,19 @@ struct ComputationGraphBuilder std::vector const &inputs, std::vector>> const &weight_shapes, - Tensor const &output); + TensorShape const &output_shape); std::vector add_layer( Layer const &layer, std::vector const &inputs, std::vector>> const &weight_shapes, - std::vector const &outputs); + std::vector const &output_shapes); tensor_guid_t as_type(tensor_guid_t const &, DataType, std::string const &); TensorShape get_broadcast_target_shape(std::vector const &); - + TensorShape get_shape(tensor_guid_t const &t); + std::vector get_shapes(std::vector const &t); tensor_guid_t element_binary(OperatorType, tensor_guid_t const &lhs, @@ -286,8 +287,6 @@ struct ComputationGraphBuilder tensor_guid_t const &x, std::optional const &maybe_name); - std::unordered_map pre_edge_mapping; - public: ComputationGraph computation_graph; }; diff --git a/lib/pcg/include/pcg/tensor.h b/lib/pcg/include/pcg/tensor.h index 975a69809d..b5ff857a6c 100644 --- a/lib/pcg/include/pcg/tensor.h +++ 
b/lib/pcg/include/pcg/tensor.h @@ -33,6 +33,8 @@ FF_VISITABLE_STRUCT( using Parameter = Tensor; +Tensor construct_tensor_from_output_shape(TensorShape const &); + } // namespace FlexFlow #endif diff --git a/lib/pcg/src/computation_graph.cc b/lib/pcg/src/computation_graph.cc new file mode 100644 index 0000000000..d8a57311bf --- /dev/null +++ b/lib/pcg/src/computation_graph.cc @@ -0,0 +1,79 @@ +#include "pcg/computation_graph.h" + +namespace FlexFlow { + +std::vector traverse_comp_graph(ComputationGraph const & comp_graph) { + std::vector layers = get_topological_ordering(comp_graph.value()); + return transform(layers, [&](Node const &e) -> operator_guid_t { + return operator_guid_t{e}; + }); +} + +std::vector traverse_comp_graph_backwards(ComputationGraph const & comp_graph) { + std::vector layers = + reversed>(get_topological_ordering(comp_graph.value())); + return transform(layers, [&](Node const &e) -> operator_guid_t { + return operator_guid_t{e}; + }); +} + +bool src_edge_comparator(MultiDiOutput x, MultiDiOutput y) { + return x.src_idx < y.src_idx; +} + +std::vector + sort_edge_set(std::unordered_set edges) { + std::unordered_set outputs = + transform(edges, [&](MultiDiEdge const &e) -> MultiDiOutput { + return MultiDiOutput(e); + }); + std::vector sorted_outputs(outputs.begin(), outputs.end()); + sort(sorted_outputs.begin(), sorted_outputs.end(), src_edge_comparator); + return transform(sorted_outputs, + [&](MultiDiOutput const &e) -> tensor_guid_t { + return tensor_guid_t{e}; + }); +} + +std::vector get_outgoing_tensors(ComputationGraph const & comp_graph, + operator_guid_t n) { + return sort_edge_set(get_outgoing_edges(comp_graph.value(), n.value())); +} + +std::vector get_incoming_tensors(ComputationGraph const & comp_graph, operator_guid_t n) { + return sort_edge_set(get_incoming_edges(comp_graph.value(), n.value())); +} + +operator_guid_t add_node(ComputationGraph & comp_graph, Layer const &layer) { + Node added_node = comp_graph.value().add_node(layer); 
+ return operator_guid_t{added_node}; +} + +tensor_guid_t create_outgoing_edge_with_label(ComputationGraph & comp_graph, + operator_guid_t node, + int idx, + Tensor tensor) { + MultiDiOutput edge = {node.value(), NodePort{idx}}; + comp_graph.value().add_output(edge, tensor); + return tensor_guid_t{edge}; +} + +void add_incoming_edges(ComputationGraph & comp_graph, +std::vector const &incoming_edges, + operator_guid_t node) { + size_t incoming_edge_dst_port = 0; + for (tensor_guid_t input : incoming_edges) { + MultiDiOutput input_view = input.value(); + MultiDiEdge edge = {node.value(), + NodePort{incoming_edge_dst_port++}, + input_view.src, + input_view.src_idx}; + comp_graph.value().add_edge(edge); + } +} + +CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const & comp_graph, operator_guid_t const &n) { + return comp_graph.at(n).attrs; +} + +} \ No newline at end of file diff --git a/lib/pcg/src/computation_graph_builder.cc b/lib/pcg/src/computation_graph_builder.cc index f308a4b242..78e49f0695 100644 --- a/lib/pcg/src/computation_graph_builder.cc +++ b/lib/pcg/src/computation_graph_builder.cc @@ -11,11 +11,14 @@ tensor_guid_t ComputationGraphBuilder::add_layer( std::vector const &inputs, std::vector>> const &weight_shapes, - Tensor const &output) { - operator_guid_t node = computation_graph.add_node(layer); - this->computation_graph.add_incoming_edges(inputs, node); - return this->computation_graph.create_outgoing_edge_with_label( - node, 0, output); + TensorShape const &output_shape) { + operator_guid_t node = add_node(computation_graph, layer); + add_incoming_edges(computation_graph, inputs, node); + return create_outgoing_edge_with_label( + computation_graph, + node, + 0, + construct_tensor_from_output_shape(output_shape)); } std::vector ComputationGraphBuilder::add_layer( @@ -23,14 +26,16 @@ std::vector ComputationGraphBuilder::add_layer( std::vector const &inputs, std::vector>> const &weight_shapes, - std::vector const &outputs) { - operator_guid_t 
node = computation_graph.add_node(layer); - this->computation_graph.add_incoming_edges(inputs, node); + std::vector const &output_shapes) { + operator_guid_t node = add_node(computation_graph, layer); + add_incoming_edges(computation_graph, inputs, node); std::vector output_tensor_guids; - for (int i = 0; i < outputs.size(); ++i) { - output_tensor_guids.push_back( - this->computation_graph.create_outgoing_edge_with_label( - node, i, outputs[i])); + for (int i = 0; i < output_shapes.size(); ++i) { + output_tensor_guids.push_back(create_outgoing_edge_with_label( + computation_graph, + node, + i, + construct_tensor_from_output_shape(output_shapes[i]))); } return output_tensor_guids; } @@ -48,12 +53,13 @@ tensor_guid_t tensor_guid_t ComputationGraphBuilder::as_type(tensor_guid_t const &x, DataType data_type, std::string const &name) { - if (x.data_type < data_type) { + Tensor tensor = computation_graph.at(x); + if (tensor.data_type < data_type) { return this->cast(x, data_type, name); - } else if (x.data_type > data_type) { + } else if (tensor.data_type > data_type) { throw mk_runtime_error("Could not convert provided tensor data type {} to " "desired data type {}", - x.data_type, + tensor.data_type, data_type); } return x; @@ -82,7 +88,8 @@ tensor_guid_t ComputationGraphBuilder::element_unary( this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, input); + TensorShape output_shape = + get_output_shape(attrs, computation_graph.at(input)); return this->add_layer(layer, {input}, {}, output_shape); } @@ -97,7 +104,8 @@ tensor_guid_t ComputationGraphBuilder::element_scalar_unary( this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, input); + TensorShape output_shape = + get_output_shape(attrs, computation_graph.at(input)); return this->add_layer(layer, {input}, {}, output_shape); } @@ -126,8 +134,12 
@@ tensor_guid_t ComputationGraphBuilder::element_binary( std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(op_type)); - TensorShape compute_shape = this->get_broadcast_target_shape({lhs, rhs}); - DataType compute_type = std::max(lhs.data_type, rhs.data_type); + Tensor lhs_tensor = computation_graph.at(lhs); + Tensor rhs_tensor = computation_graph.at(rhs); + + TensorShape compute_shape = + this->get_broadcast_target_shape({lhs_tensor, rhs_tensor}); + DataType compute_type = std::max(lhs_tensor.data_type, rhs_tensor.data_type); tensor_guid_t const lhs_input = this->as_type(this->broadcast(lhs, compute_shape), @@ -141,7 +153,8 @@ tensor_guid_t ComputationGraphBuilder::element_binary( ElementBinaryAttrs attrs = {op_type, compute_type, false, false}; Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, lhs_input, rhs_input); + TensorShape output_shape = get_output_shape( + attrs, computation_graph.at(lhs_input), computation_graph.at(rhs_input)); return this->add_layer(layer, {lhs_input, rhs_input}, {}, output_shape); } @@ -162,18 +175,20 @@ tensor_guid_t ComputationGraphBuilder::dense( tensor_guid_t input_recast = this->as_type(input, data_type, unwrapped_name + "input_recast"); + Tensor input_recast_tensor = computation_graph.at(input_recast); Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, input_recast); + TensorShape output_shape = get_output_shape(attrs, input_recast_tensor); Tensor output = { output_shape.dims, data_type, std::nullopt, false, std::nullopt}; std::vector>> weights; weights.push_back( - {get_weights_shape(attrs, input_recast), kernel_initializer}); + {get_weights_shape(attrs, input_recast_tensor), kernel_initializer}); if (use_bias) { - weights.push_back({get_bias_shape(attrs, input_recast), bias_initializer}); + weights.push_back( + {get_bias_shape(attrs, input_recast_tensor), bias_initializer}); } return this->add_layer(layer, {input_recast}, 
weights, output); @@ -348,15 +363,18 @@ tensor_guid_t ComputationGraphBuilder::conv2d( tensor_guid_t input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + Tensor input_tensor = computation_graph.at(input); + Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, input); + TensorShape output_shape = get_output_shape(attrs, input_tensor); std::vector>> weights; - weights.push_back({get_kernel_shape(attrs, input), kernel_initializer}); + weights.push_back( + {get_kernel_shape(attrs, input_tensor), kernel_initializer}); if (use_bias) { - weights.push_back({get_bias_shape(attrs, input), bias_initializer}); + weights.push_back({get_bias_shape(attrs, input_tensor), bias_initializer}); } return this->add_layer(layer, {input}, weights, output_shape); @@ -374,7 +392,8 @@ tensor_guid_t ComputationGraphBuilder::dropout( tensor_guid_t input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); - TensorShape output_shape = get_output_shape(attrs, input); + TensorShape output_shape = + get_output_shape(attrs, computation_graph.at(input)); return this->add_layer(layer, {input}, {}, output_shape); } @@ -394,8 +413,9 @@ tensor_guid_t ComputationGraphBuilder::embedding( tensor_guid_t input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); - TensorShape output_shape = get_output_shape(attrs, input); - TensorShape weights_shape = get_weights_shape(attrs, input); + Tensor input_tensor = computation_graph.at(input); + TensorShape output_shape = get_output_shape(attrs, input_tensor); + TensorShape weights_shape = get_weights_shape(attrs, input_tensor); return this->add_layer( layer, {input}, {{weights_shape, kernel_initializer}}, output_shape); @@ -410,16 +430,17 @@ std::vector ComputationGraphBuilder::gather( std::string name = maybe_name.value_or(get_default_name(attrs)); Layer layer = {attrs, name}; - if (index.data_type != DataType::INT32 && - index.data_type != DataType::INT64) { + Tensor index_tensor = 
computation_graph.at(index); + if (index_tensor.data_type != DataType::INT32 && + index_tensor.data_type != DataType::INT64) { throw mk_runtime_error("Invalid data type for input tensor 2 for Gather: " "{} (should be {} or {})", - input.data_type, + index_tensor.data_type, DataType::INT32, DataType::INT64); } std::vector output_shapes = - get_output_shapes(attrs, input, index); + get_output_shapes(attrs, computation_graph.at(input), index_tensor); return this->add_layer(layer, {input}, {}, output_shapes); } @@ -428,17 +449,18 @@ tensor_guid_t ComputationGraphBuilder::input(Tensor const &input_tensor, std::optional const &name) { InputAttrs input_attrs = {}; - std::string name = name.value_or(get_default_name(input_attrs)); + std::string str_name = name.value_or(get_default_name(input_attrs)); - Layer layer = {attrs, name}; + Layer layer = {input_attrs, str_name}; return this->add_layer(layer, {}, {}, input_tensor); } -TensorShape get_shape(tensor_guid_t const &t) { - return this->computation_graph.at(t).get_shape(); +TensorShape ComputationGraphBuilder::get_shape(tensor_guid_t const &t) { + return computation_graph.at(t).get_shape(); } -std::vector get_shape(std::vector const &) { +std::vector + ComputationGraphBuilder::get_shapes(std::vector const &) { NOT_IMPLEMENTED(); } diff --git a/lib/pcg/src/tensor.cc b/lib/pcg/src/tensor.cc index a5aa4b0d0c..df29ee0065 100644 --- a/lib/pcg/src/tensor.cc +++ b/lib/pcg/src/tensor.cc @@ -10,4 +10,8 @@ TensorShape Tensor::get_shape() const { return TensorShape(*this); } +Tensor construct_tensor_from_output_shape(TensorShape const &shape) { + return Tensor{shape.dims, shape.data_type, std::nullopt, false, std::nullopt}; +} + } // namespace FlexFlow From 784742c921dc3bcc9216f9513789319a7cd958e4 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 21 May 2024 19:09:43 -0700 Subject: [PATCH 08/24] Add task spec --- .../include}/arg_ref.h | 32 +-- lib/local-execution/include/concrete_arg.h | 55 +++++ .../include}/config.h | 24 
++- .../include}/cost_metrics.h | 4 +- .../include}/device_specific.h | 15 +- lib/local-execution/include/local_allocator.h | 4 +- .../include}/op_arg_ref.h | 4 +- .../include}/op_task_invocation.h | 105 +++++----- .../include}/op_task_signature.h | 64 +++--- .../include}/op_tensor_spec.h | 7 +- .../include}/permissions.h | 9 +- .../include}/profiling.h | 17 +- .../include}/runtime_arg_ref.h | 16 +- .../include}/serialization.h | 130 +----------- .../include}/sim_environment.h | 9 +- .../include}/slot_id.h | 4 +- .../include}/slot_type.h | 4 +- .../include/task_argument_accessor.h | 155 ++++++++++++++ .../src => local-execution/include}/tasks.h | 11 +- .../include}/variadic_tensor_ref.h | 4 +- .../src}/op_task_invocation.cc | 6 - lib/local-execution/src/op_task_signature.cc | 81 ++++++++ .../src/permissions.cc | 30 --- .../include/runtime/task_spec/concrete_arg.h | 46 ----- .../src/task_spec/task_argument_accessor.h | 193 ------------------ 25 files changed, 482 insertions(+), 547 deletions(-) rename lib/{runtime/src/task_spec => local-execution/include}/arg_ref.h (52%) create mode 100644 lib/local-execution/include/concrete_arg.h rename lib/{runtime/include/runtime => local-execution/include}/config.h (89%) rename lib/{runtime/src => local-execution/include}/cost_metrics.h (95%) rename lib/{runtime/src/task_spec => local-execution/include}/device_specific.h (67%) rename lib/{runtime/src/task_spec => local-execution/include}/op_arg_ref.h (84%) rename lib/{runtime/src/task_spec => local-execution/include}/op_task_invocation.h (50%) rename lib/{runtime/src/task_spec => local-execution/include}/op_task_signature.h (71%) rename lib/{runtime/src/task_spec => local-execution/include}/op_tensor_spec.h (55%) rename lib/{runtime/src => local-execution/include}/permissions.h (84%) rename lib/{runtime/include/runtime => local-execution/include}/profiling.h (57%) rename lib/{runtime/src/task_spec => local-execution/include}/runtime_arg_ref.h (56%) rename lib/{runtime/src => 
local-execution/include}/serialization.h (55%) rename lib/{runtime/src => local-execution/include}/sim_environment.h (95%) rename lib/{runtime/include/runtime/task_spec => local-execution/include}/slot_id.h (73%) rename lib/{runtime/src/task_spec => local-execution/include}/slot_type.h (86%) create mode 100644 lib/local-execution/include/task_argument_accessor.h rename lib/{runtime/src => local-execution/include}/tasks.h (95%) rename lib/{runtime/src/task_spec => local-execution/include}/variadic_tensor_ref.h (72%) rename lib/{runtime/src/task_spec => local-execution/src}/op_task_invocation.cc (85%) create mode 100644 lib/local-execution/src/op_task_signature.cc rename lib/{runtime => local-execution}/src/permissions.cc (67%) delete mode 100644 lib/runtime/include/runtime/task_spec/concrete_arg.h delete mode 100644 lib/runtime/src/task_spec/task_argument_accessor.h diff --git a/lib/runtime/src/task_spec/arg_ref.h b/lib/local-execution/include/arg_ref.h similarity index 52% rename from lib/runtime/src/task_spec/arg_ref.h rename to lib/local-execution/include/arg_ref.h index 62f89f0b5c..67e8a47404 100644 --- a/lib/runtime/src/task_spec/arg_ref.h +++ b/lib/local-execution/include/arg_ref.h @@ -1,9 +1,9 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H #include "kernels/ff_handle.h" -#include "runtime/profiling.h" -#include "runtime/task_spec/arg_type_runtime_tag.h" +#include "profiling.h" +#include "serialization.h" #include "utils/type_index.h" #include "utils/visitable.h" @@ -21,37 +21,43 @@ struct ArgRefSpec { template bool holds() const { - return this->type_tag.template matches(); + // return this->type_tag.template matches(); + + return matches(this->type_idx); } LABEL_TYPE const &get_ref_type() const { return this->ref_type; } - ArgTypeRuntimeTag get_type_tag() const { - return this->type_tag; + // TODO - how to extend this for legion 
runtime? + // ArgTypeRuntimeTag get_type_tag() const { + // return this->type_tag; + // } + std::type_index get_type_index() const { + return this->type_idx; } template static ArgRefSpec create(ArgRef const &r) { static_assert(is_serializable::value, "Type must be serializeable"); - return ArgRefSpec(ArgTypeRuntimeTag::create(), r.ref_type); + return ArgRefSpec(init_type_index(), r.ref_type); } template static ArgRefSpec create_device_specific(ArgRef const &r, size_t device_idx) { - return ArgRefSpec(ArgTypeRuntimeTag::create(), r.ref_type, device_idx); + return ArgRefSpec(init_type_index(), r.ref_type, device_idx); } private: - ArgRefSpec(ArgTypeRuntimeTag const &type_tag, LABEL_TYPE ref_type) - : type_tag(type_tag), ref_type(ref_type) {} + ArgRefSpec(std::type_index const &type_index, LABEL_TYPE ref_type) + : type_idx(type_index), ref_type(ref_type) {} - ArgTypeRuntimeTag type_tag; + std::type_index type_idx; LABEL_TYPE ref_type; - optional device_idx = nullopt; + std::optional device_idx = std::nullopt; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/concrete_arg.h b/lib/local-execution/include/concrete_arg.h new file mode 100644 index 0000000000..522d21485e --- /dev/null +++ b/lib/local-execution/include/concrete_arg.h @@ -0,0 +1,55 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H +#define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H + +#include "serialization.h" +#include "utils/type_index.h" +#include + +namespace FlexFlow { + +struct ConcreteArgSpec { +public: + ConcreteArgSpec() = delete; + + template + T const &get() const { + assert(matches(this->type_idx)); + + return *(T const *)ptr.get(); + } + + // ArgTypeRuntimeTag get_type_tag() const { + // return this->type_tag; + // } + // size_t serialize(Legion::Serializer &) const; + + std::type_index get_type_index() const { + return this->type_idx; + } + + template + static ConcreteArgSpec create(T const &t) { + static_assert(is_serializable::value, "Type must be serializable"); + + 
std::type_index type_idx = init_type_index(); + std::shared_ptr ptr = + std::static_pointer_cast(std::make_shared(t)); + + return ConcreteArgSpec(type_idx, ptr); + // ArgTypeRuntimeTag::create()); + } + +private: + ConcreteArgSpec(std::type_index const &type_index, + std::shared_ptr ptr) + : type_idx(type_index), ptr(ptr) {} + // ArgTypeRuntimeTag const &); + + // ArgTypeRuntimeTag type_tag; + std::type_index type_idx; + std::shared_ptr ptr; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/include/runtime/config.h b/lib/local-execution/include/config.h similarity index 89% rename from lib/runtime/include/runtime/config.h rename to lib/local-execution/include/config.h index 34f45040d1..73653aebae 100644 --- a/lib/runtime/include/runtime/config.h +++ b/lib/local-execution/include/config.h @@ -13,12 +13,11 @@ * limitations under the License. */ -#ifndef _FLEXFLOW_CONFIG_H_ -#define _FLEXFLOW_CONFIG_H_ -#include "legion.h" +#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONFIG_H_ +#define _FLEXFLOW_LOCAL_EXECUTION_CONFIG_H_ + #include "op-attrs/param_sync.h" #include "utils/fmt.h" -#include "utils/optional.h" #include "utils/visitable.h" #include @@ -47,6 +46,8 @@ struct FFInitInfo : public use_visitable_cmp { bool allowTensorOpMathConversion; }; +using legion_mapping_tag_id_t = unsigned long; + struct FFConfig : public use_visitable_cmp { public: enum PreservedIDs { @@ -64,7 +65,7 @@ struct FFConfig : public use_visitable_cmp { }; FFConfig() = default; - static Legion::MappingTagID get_hash_id(std::string const &pcname); + static legion_mapping_tag_id_t get_hash_id(std::string const &pcname); public: int epochs = 1; @@ -88,16 +89,17 @@ struct FFConfig : public use_visitable_cmp { bool enable_inplace_optimizations = false; // Control Tensor Op Math Conversion bool allow_tensor_op_math_conversion = false; - optional dataset_path = nullopt; - optional export_strategy_computation_graph_file = nullopt; + std::optional dataset_path = std::nullopt; + std::optional 
export_strategy_computation_graph_file = + std::nullopt; bool include_costs_dot_graph = false; - optional substitution_json_path = nullopt; + std::optional substitution_json_path = std::nullopt; int machine_model_version = 0; - optional machine_model_file = nullopt; + std::optional machine_model_file = std::nullopt; int simulator_segment_size = 16777216; // 16 MB int simulator_max_num_segments = 1; - optional search_num_nodes = nullopt; - optional search_num_workers = nullopt; + std::optional search_num_nodes = std::nullopt; + std::optional search_num_workers = std::nullopt; int base_optimize_threshold = 10; bool enable_control_replication = true; // The default python data loader type is 2 to enable control replication diff --git a/lib/runtime/src/cost_metrics.h b/lib/local-execution/include/cost_metrics.h similarity index 95% rename from lib/runtime/src/cost_metrics.h rename to lib/local-execution/include/cost_metrics.h index 77526ccd1a..edc0190daf 100644 --- a/lib/runtime/src/cost_metrics.h +++ b/lib/local-execution/include/cost_metrics.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_COST_METRICS_H -#define _FLEXFLOW_RUNTIME_SRC_COST_METRICS_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H +#define _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H #include "utils/visitable.h" diff --git a/lib/runtime/src/task_spec/device_specific.h b/lib/local-execution/include/device_specific.h similarity index 67% rename from lib/runtime/src/task_spec/device_specific.h rename to lib/local-execution/include/device_specific.h index e29e4e9450..a055f6d274 100644 --- a/lib/runtime/src/task_spec/device_specific.h +++ b/lib/local-execution/include/device_specific.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_DEVICE_SPECIFIC_ARG_H -#define _FLEXFLOW_RUNTIME_SRC_DEVICE_SPECIFIC_ARG_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H +#define _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H #include "serialization.h" #include "utils/exception.h" @@ -10,10 +10,17 @@ template struct 
DeviceSpecific { DeviceSpecific() = delete; + DeviceSpecific(T ptr_type) { // accessor + size_t device_idx = 0; + DeviceSpecific device_specific = + DeviceSpecific::create(device_idx, ptr_type); + this->ptr = device_specific.ptr; + this->device_idx = device_specific.device_idx; + } template static DeviceSpecific create(size_t device_idx, Args &&...args) { - NOT_IMPLEMENTED(); + NOT_IMPLEMENTED(); // accessor } T const *get(size_t curr_device_idx) const { @@ -26,6 +33,8 @@ struct DeviceSpecific { return this->ptr; } + // TODO: can modify ptr + private: T *ptr; size_t device_idx; diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local_allocator.h index f4b253b281..b47220eb8c 100644 --- a/lib/local-execution/include/local_allocator.h +++ b/lib/local-execution/include/local_allocator.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H -#define _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H #include "kernels/allocation.h" #include diff --git a/lib/runtime/src/task_spec/op_arg_ref.h b/lib/local-execution/include/op_arg_ref.h similarity index 84% rename from lib/runtime/src/task_spec/op_arg_ref.h rename to lib/local-execution/include/op_arg_ref.h index 3e931d79a4..02b354b221 100644 --- a/lib/runtime/src/task_spec/op_arg_ref.h +++ b/lib/local-execution/include/op_arg_ref.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H #include "arg_ref.h" #include "device_specific.h" diff --git a/lib/runtime/src/task_spec/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h similarity index 50% rename from lib/runtime/src/task_spec/op_task_invocation.h rename to lib/local-execution/include/op_task_invocation.h index 56e709734e..2079fabcbc 100644 --- 
a/lib/runtime/src/task_spec/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -1,49 +1,46 @@ #ifndef _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H #define _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H -#include "accessor.h" -#include "index_task_invocation.h" -#include "legion.h" +#include "concrete_arg.h" +#include "kernels/accessor.h" #include "op_arg_ref.h" #include "op_task_signature.h" #include "op_tensor_spec.h" -#include "runtime/config.h" -#include "runtime/profiling.h" +#include "profiling.h" +#include "runtime_arg_ref.h" #include "serialization.h" -#include "standard_task_invocation.h" #include "tasks.h" #include "utils/bidict.h" -#include "utils/optional.h" #include "utils/stack_map.h" #include "variadic_tensor_ref.h" #include #include #include +#include namespace FlexFlow { enum class IsTrainable { YES, NO }; -using OpArgSpec = variant; +using OpArgSpec = + std::variant; + +struct OpArgSpecTypeAccessor { + std::type_index operator()(OpArgSpec &spec) { + return std::visit( + [](auto &&arg) -> std::type_index { return arg.get_type_index(); }, + spec); + } +}; struct OpTaskBinding { OpTaskBinding() = default; - static_assert(is_subeq_variant::value, ""); - - void bind(slot_id, OpTensorSpec const &); - void bind_grad(slot_id, OpTensorSpec const &); - - template - void bind(slot_id name, VariadicTensorRef const &t) { + void bind(slot_id, VariadicTensorRef const &) { NOT_IMPLEMENTED(); } + void bind(slot_id, OpTensorSpec const &); + void bind_grad(slot_id, OpTensorSpec const &); template void bind_device_specific_arg(slot_id name, T const &t) { @@ -70,46 +67,31 @@ struct OpTaskBinding { this->insert_arg_spec(name, OpArgRefSpec::create(ref)); } - template - void bind_arg(slot_id name, TypedFuture const &f) { - this->insert_arg_spec(name, CheckedTypedFuture::create(f)); + void bind_args_from_fwd(OpTaskBinding const &fwd) { + this->arg_bindings = fwd.get_arg_bindings(); } - template - void bind_arg(slot_id name, TypedFutureMap const &fm) { - 
this->insert_arg_spec(name, CheckedTypedFutureMap::create(fm)); + void bind_tensors_from_fwd(OpTaskBinding const &fwd) { + this->tensor_bindings = fwd.get_tensor_bindings(); } std::unordered_map, OpTensorSpec> const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; -private: void insert_arg_spec(slot_id name, OpArgSpec const &arg_spec) { assert(!contains_key(this->arg_bindings, name)); this->arg_bindings.insert({name, arg_spec}); } - // template - // ArgSpec generate_arg_spec(T const &t) { - // static_assert(is_serializable, "Type must be serializable"); - - // size_t pre_size = serializer.get_used_bytes(); - // ff_task_serialize(serializer, t); - // size_t post_size = serializer.get_used_bytes(); - // return { - // typeid(T), - // pre_size, - // post_size - pre_size - // }; - // } - - /* Legion::Serializer serializer; */ std::unordered_map arg_bindings; std::unordered_map, OpTensorSpec> tensor_bindings; }; +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskBinding, + arg_bindings, + tensor_bindings); -struct OpTaskInvocation : public use_visitable_cmp { +struct OpTaskInvocation { public: OpTaskInvocation() = delete; OpTaskInvocation(task_id_t const &task_id, OpTaskBinding const &binding) @@ -119,16 +101,41 @@ struct OpTaskInvocation : public use_visitable_cmp { task_id_t task_id; OpTaskBinding binding; }; +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskInvocation, + task_id, + binding); OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -OpTaskSignature get_op_signature(task_id_t const &); -/* std::unordered_map get_regions_idxs(TaskArgumentFormat - * const &); */ +bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { + // tensors + auto tensor_bindings = inv.binding.get_tensor_bindings(); + for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { + slot_id name = op_tensor_slot_spec.name; + IsGrad is_grad = 
op_tensor_slot_spec.is_grad; + std::pair tensor_key = std::make_pair(name, is_grad); + OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); + if (op_tensor_spec.role != op_tensor_slot_spec.tensor_role || + op_tensor_spec.slot_option != op_tensor_slot_spec.slot_option) { + return false; + } + } + + // args + auto sig_arg_types = sig.get_arg_types(); + OpArgSpecTypeAccessor type_accessor; + for (auto arg_binding : inv.binding.get_arg_bindings()) { + slot_id name = arg_binding.first; + OpArgSpec op_arg_spec = arg_binding.second; + std::type_index arg_type = sig_arg_types.at(name); + if (type_accessor(op_arg_spec) != arg_type) { + return false; + } + } -/* TaskArgumentFormat compile_task_invocation(OpTaskSignature const &, - * OpTaskBinding const &); */ + return true; +} } // namespace FlexFlow diff --git a/lib/runtime/src/task_spec/op_task_signature.h b/lib/local-execution/include/op_task_signature.h similarity index 71% rename from lib/runtime/src/task_spec/op_task_signature.h rename to lib/local-execution/include/op_task_signature.h index 656df39309..626266d10f 100644 --- a/lib/runtime/src/task_spec/op_task_signature.h +++ b/lib/local-execution/include/op_task_signature.h @@ -1,8 +1,11 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OP_TASK_SIGNATURE_H -#define _FLEXFLOW_RUNTIME_SRC_OP_TASK_SIGNATURE_H - -#include "task_invocation.h" -#include "task_signature.h" +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H + +#include "serialization.h" +#include "slot_id.h" +#include "slot_type.h" +#include "tasks.h" +#include "utils/type_index.h" #include "utils/visitable.h" namespace FlexFlow { @@ -14,6 +17,7 @@ enum class TensorRole { }; enum class OpTaskType { INIT, FWD, BWD }; +enum class IsGrad { YES, NO }; enum class OpSlotOptions { OPTIONAL, @@ -25,7 +29,6 @@ enum class OpSlotOptions { struct OpTensorSlotSpec { public: OpTensorSlotSpec() = delete; - OpTensorSlotSpec(slot_id, SlotType, TensorRole); public: 
slot_id name; @@ -38,10 +41,12 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( OpTensorSlotSpec, name, slot_type, tensor_role, is_grad, slot_option); struct OpTaskSignature { - OpTaskSignature() = delete; - explicit OpTaskSignature(OpTaskType); + OpTaskSignature() = default; + // explicit OpTaskSignature(OpTaskType); - OpTaskType get_task_type() const; + OpTaskType get_task_type() const { + return this->type; + } void add_input_slot(slot_id, SlotType slot_type = SlotType::TENSOR); void add_optional_input_slot(slot_id, SlotType slot_type = SlotType::TENSOR); @@ -59,45 +64,39 @@ struct OpTaskSignature { void add_from_slot_spec(OpTensorSlotSpec const &spec); - /* void add_input_slot(slot_id, Legion::PrivilegeMode); */ - /* void add_input_slot(slot_id, SlotType, Legion::PrivilegeMode); */ - - bool operator==(OpTaskSignature const &) const; - bool operator!=(OpTaskSignature const &) const; - template void add_arg_slot(slot_id name) { static_assert(is_serializable::value, "Type must be serializable"); + this->task_arg_types.insert({name, init_type_index()}); } template - void add_return_value(); + void add_return_value() { + // std::type_index return_value = init_type_index(); + this->return_value = init_type_index(); + } // adds arg_slot without checking is_serializable, used for arguments that are // deviceSpecific template void add_unchecked_arg_slot(slot_id name) { - NOT_IMPLEMENTED(); + this->task_arg_types.insert({name, init_type_index()}); } std::unordered_set get_tensor_slots(); void set_arg_types(std::unordered_map const &); std::unordered_map get_arg_types(); -private: + OpTaskType type; + std::optional return_value; std::unordered_map task_arg_types; std::unordered_set op_tensor_slots; }; - -template -OpTaskSignature init_signature(); -template -OpTaskSignature fwd_signature(); -template -OpTaskSignature bwd_signature(); - -template -OpTaskSignature get_signature(); +// FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskSignature, +// type, +// return_value, 
+// task_arg_types, +// op_tensor_slots); template void register_task(task_id_t, @@ -112,6 +111,15 @@ void register_task(task_id_t, F const &func, F const &cpu_func); +template +OpTaskSignature init_signature(); + +template +OpTaskSignature fwd_signature(); + +template +OpTaskSignature bwd_signature(); + } // namespace FlexFlow #endif diff --git a/lib/runtime/src/task_spec/op_tensor_spec.h b/lib/local-execution/include/op_tensor_spec.h similarity index 55% rename from lib/runtime/src/task_spec/op_tensor_spec.h rename to lib/local-execution/include/op_tensor_spec.h index d859bb3072..c12b5342e1 100644 --- a/lib/runtime/src/task_spec/op_tensor_spec.h +++ b/lib/local-execution/include/op_tensor_spec.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_TENSOR_SPEC_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_TENSOR_SPEC_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H #include "op_task_signature.h" @@ -7,9 +7,10 @@ namespace FlexFlow { struct OpTensorSpec { TensorRole role; + OpSlotOptions slot_option; req idx; }; -FF_VISITABLE_STRUCT(OpTensorSpec, role, idx); +FF_VISITABLE_STRUCT(OpTensorSpec, role, slot_option, idx); OpTensorSpec input_tensor(int); OpTensorSpec output_tensor(int); diff --git a/lib/runtime/src/permissions.h b/lib/local-execution/include/permissions.h similarity index 84% rename from lib/runtime/src/permissions.h rename to lib/local-execution/include/permissions.h index e7793a1dcb..ce19e38e7e 100644 --- a/lib/runtime/src/permissions.h +++ b/lib/local-execution/include/permissions.h @@ -1,18 +1,13 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_PERMISSION_H -#define _FLEXFLOW_RUNTIME_SRC_PERMISSION_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H +#define _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H -#include "legion.h" #include "utils/exception.h" #include "utils/fmt.h" -#include "utils/optional.h" namespace FlexFlow { enum class Permissions { NONE, RO, WO, RW }; -Legion::PrivilegeMode 
to_legion(Permissions); -optional from_legion(Legion::PrivilegeMode); - Permissions join(Permissions lhs, Permissions rhs); Permissions meet(Permissions lhs, Permissions rhs); diff --git a/lib/runtime/include/runtime/profiling.h b/lib/local-execution/include/profiling.h similarity index 57% rename from lib/runtime/include/runtime/profiling.h rename to lib/local-execution/include/profiling.h index 3f43ede520..066cdc8404 100644 --- a/lib/runtime/include/runtime/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -1,22 +1,21 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_PROFILING_H -#define _FLEXFLOW_RUNTIME_SRC_PROFILING_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H +#define _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H #include "kernels/profiling.h" -#include "legion.h" -#include "loggers.h" namespace FlexFlow { enum class EnableProfiling { YES, NO }; template -optional +std::optional profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { - optional elapsed = + std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); - if (elapsed.has_value()) { - log_profile.debug(s, elapsed.value()); - } + // TODO -- local logger? 
+ // if (elapsed.has_value()) { + // log_profile.debug(s, elapsed.value()); + // } return elapsed; } diff --git a/lib/runtime/src/task_spec/runtime_arg_ref.h b/lib/local-execution/include/runtime_arg_ref.h similarity index 56% rename from lib/runtime/src/task_spec/runtime_arg_ref.h rename to lib/local-execution/include/runtime_arg_ref.h index 655300e692..1493531dc3 100644 --- a/lib/runtime/src/task_spec/runtime_arg_ref.h +++ b/lib/local-execution/include/runtime_arg_ref.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #include "arg_ref.h" +#include "config.h" #include "device_specific.h" -#include "runtime/config.h" namespace FlexFlow { @@ -18,9 +18,17 @@ using RuntimeArgRef = ArgRef; using RuntimeArgRefSpec = ArgRefSpec; -RuntimeArgRef profiling_settings(); -RuntimeArgRef> ff_handle(); -RuntimeArgRef iteration_config(); +RuntimeArgRef profiling_settings() { + return {RuntimeArgRefType::PROFILING_SETTINGS}; +} + +RuntimeArgRef> ff_handle() { + return {RuntimeArgRefType::FF_HANDLE}; +} + +RuntimeArgRef iteration_config() { + return {RuntimeArgRefType::FF_ITERATION_CONFIG}; +} } // namespace FlexFlow diff --git a/lib/runtime/src/serialization.h b/lib/local-execution/include/serialization.h similarity index 55% rename from lib/runtime/src/serialization.h rename to lib/local-execution/include/serialization.h index 65601990b0..147ed8159c 100644 --- a/lib/runtime/src/serialization.h +++ b/lib/local-execution/include/serialization.h @@ -1,12 +1,9 @@ -#ifndef _FLEXFLOW_RUNTIME_SERIALIZATION_H -#define _FLEXFLOW_RUNTIME_SERIALIZATION_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SERIALIZATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_SERIALIZATION_H #include "kernels/device.h" #include "kernels/nccl.h" -#include "legion.h" -#include "legion/legion_utilities.h" #include "op-attrs/dim_ordered.h" -#include "utils/optional.h" #include "utils/required.h" #include "utils/type_traits.h" #include "utils/variant.h" @@ -28,23 +25,6 @@ namespace FlexFlow { template struct 
needs_serialization {}; -/* template */ -/* class Serializer { */ -/* void serialize(Legion::Serializer &, T const &) const; */ -/* void deserialize(Legion::Deserializer &, T &) const; */ -/* }; */ - -/* template struct trivially_serializable; */ - -/* template struct - * visit_trivially_serializable; */ - -/* template >::value && - * visit_serializable::value)>::type> */ - template struct visit_trivially_serializable; @@ -101,6 +81,10 @@ struct is_trivially_serializable< typename std::enable_if::value>::type> : std::true_type {}; +template +struct is_trivially_serializable> + : is_trivially_serializable {}; + template struct is_trivially_serializable> : is_trivially_serializable {}; @@ -155,108 +139,6 @@ static_assert(std::is_same, static_assert(visit_trivially_serializable::value, ""); static_assert(is_trivially_serializable::value, ""); -template -struct Serialization { - void serialize(Legion::Serializer &, T const &) const; - T deserialize(Legion::Deserializer &) const; -}; - -template -struct Serialization< - T, - typename std::enable_if::value>::type> { - static void serialize(Legion::Serializer &sez, T const &t) { - sez.serialize(&t, sizeof(T)); - } - - static T const &deserialize(Legion::Deserializer &dez) { - void const *cur = dez.get_current_pointer(); - dez.advance_pointer(sizeof(T)); - return *(T const *)cur; - } -}; - -struct needs_serialize_visitor { - bool result = true; - - template - void operator()(char const *, T const &t) { - result &= needs_serialize(t); - } -}; - -template -bool visit_needs_serialize(T const &t) { - needs_serialize_visitor vis; - visit_struct::for_each(t, vis); - return vis.result; -} - -struct serialize_visitor { - serialize_visitor() = delete; - explicit serialize_visitor(Legion::Serializer &sez) : sez(sez) {} - - Legion::Serializer &sez; - - template - void operator()(char const *, T const &t) { - serialize(this->sez, t); - } -}; - -template -void visit_serialize(Legion::Serializer &sez, T const &t) { - serialize_visitor 
vis(sez); - visit_struct::for_each(t, vis); -} - -struct deserialize_visitor { - deserialize_visitor() = delete; - explicit deserialize_visitor(Legion::Deserializer &dez) : dez(dez) {} - - Legion::Deserializer &dez; - - template - T const &operator()(char const *, T &t) { - deserialize(dez, t); - } -}; - -template -T const &visit_deserialize(Legion::Deserializer &dez) { - deserialize_visitor vis(dez); - return visit_struct::for_each(vis); -} - -template -class VisitSerialize { - void serialize(Legion::Serializer &sez, T const &t) const { - return visit_serialize(sez, t); - } - - T const &deserialize(Legion::Deserializer &dez) const { - return visit_deserialize(dez); - } -}; - -template -size_t ff_task_serialize(Legion::Serializer &sez, T const &t) { - static_assert(is_serializable::value, "Type must be serializable"); - - size_t pre_size = sez.get_used_bytes(); - Serialization::serialize(sez, t); - size_t post_size = sez.get_used_bytes(); - - return post_size - pre_size; -} - -template -T const &ff_task_deserialize(Legion::Deserializer &dez) { - static_assert(is_serializable::value, "Type must be serializable"); - - return Serialization::deserialize(dez); -} - } // namespace FlexFlow #endif diff --git a/lib/runtime/src/sim_environment.h b/lib/local-execution/include/sim_environment.h similarity index 95% rename from lib/runtime/src/sim_environment.h rename to lib/local-execution/include/sim_environment.h index 4297d9d970..4409ab8b55 100644 --- a/lib/runtime/src/sim_environment.h +++ b/lib/local-execution/include/sim_environment.h @@ -1,12 +1,13 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OPS_SIM_ENVIRONMENT_H -#define _FLEXFLOW_RUNTIME_SRC_OPS_SIM_ENVIRONMENT_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H +#define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H #include "cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" #include "op-attrs/parallel_tensor_shape.h" -#include "task_spec/op_task_invocation.h" -#include 
"task_spec/task_argument_accessor.h" +#include "op_task_invocation.h" +#include "pcg/machine_view.h" +#include "task_argument_accessor.h" #include namespace FlexFlow { diff --git a/lib/runtime/include/runtime/task_spec/slot_id.h b/lib/local-execution/include/slot_id.h similarity index 73% rename from lib/runtime/include/runtime/task_spec/slot_id.h rename to lib/local-execution/include/slot_id.h index a5e4322d3c..53820fdb2f 100644 --- a/lib/runtime/include/runtime/task_spec/slot_id.h +++ b/lib/local-execution/include/slot_id.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_ID_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_ID_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SPEC_SLOT_ID_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SPEC_SLOT_ID_H #include "utils/strong_typedef.h" diff --git a/lib/runtime/src/task_spec/slot_type.h b/lib/local-execution/include/slot_type.h similarity index 86% rename from lib/runtime/src/task_spec/slot_type.h rename to lib/local-execution/include/slot_type.h index 64b79ee281..957f89fa4e 100644 --- a/lib/runtime/src/task_spec/slot_type.h +++ b/lib/local-execution/include/slot_type.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_TYPE_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_TYPE_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SLOT_TYPE_H +#define _FLEXFLOW_LOCAL_EXECUTION_SLOT_TYPE_H #include "utils/fmt.h" diff --git a/lib/local-execution/include/task_argument_accessor.h b/lib/local-execution/include/task_argument_accessor.h new file mode 100644 index 0000000000..0656af0fe3 --- /dev/null +++ b/lib/local-execution/include/task_argument_accessor.h @@ -0,0 +1,155 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H + +#include "arg_ref.h" +#include "concrete_arg.h" +#include "config.h" +#include "device_specific.h" +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/linear_kernels.h" +#include 
"op-attrs/parallel_tensor_shape.h" +#include "op_task_signature.h" +#include "permissions.h" +#include "tasks.h" +#include "utils/variant.h" +#include +#include +#include +#include +#include +#include + +namespace FlexFlow { + +template +struct privilege_mode_to_accessor_t {}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorW; +}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorR; +}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorW; +}; + +template +using privilege_mode_to_accessor = + typename privilege_mode_to_accessor_t::type; + +using PrivilegeType = + std::variant; +using PrivilegeVariadicType = std::variant, + std::vector>; + +// TODO: define device state variant in another file +using DeviceStates = std::variant; + +using OpArgRefTypeBacking = + std::variant>; +using RuntimeArgRefTypeBacking = std::variant, + FFIterationConfig>; + +using ArgRefBacking = std:: + variant; + +struct ITaskArgumentAccessor { + ITaskArgumentAccessor &operator=(ITaskArgumentAccessor const &) = delete; + + virtual ~ITaskArgumentAccessor() = default; + + virtual ConcreteArgSpec const &get_concrete_arg(slot_id) const = 0; + virtual OpArgRefTypeBacking const &get_op_arg_ref(slot_id) const = 0; + virtual RuntimeArgRefTypeBacking const &get_runtime_arg(slot_id) const = 0; + + virtual PrivilegeType + get_tensor(slot_id slot, Permissions priv, IsGrad is_grad) const = 0; + virtual PrivilegeVariadicType get_variadic_tensor(slot_id slot, + Permissions priv, + IsGrad is_grad) const = 0; + + virtual Allocator get_allocator() const = 0; + virtual size_t get_device_idx() const = 0; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(ITaskArgumentAccessor); + +struct TaskArgumentAccessor { + template + T const &get_argument(slot_id slot) const { + if constexpr (is_in_variant::value) { + return std::get(this->ptr->get_op_arg_ref(slot)); + } else if constexpr (is_in_variant::value) { + 
return std::get(this->ptr->get_runtime_arg(slot)); + } else { + return this->ptr->get_concrete_arg(slot).get(); + } + } + + template + privilege_mode_to_accessor get_tensor(slot_id slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, IsGrad::NO)); + } + + template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, IsGrad::YES)); + } + + template + std::vector> + get_variadic_tensor(slot_id slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::NO)); + } + + template + std::vector> + get_variadic_tensor_grad(slot_id slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::YES)); + } + + Allocator get_allocator() const { + return this->ptr->get_allocator(); + } + + template + static + typename std::enable_if::value, + TaskArgumentAccessor>::type + create(Args &&...args) { + return TaskArgumentAccessor( + std::make_shared(std::forward(args)...)); + } + +private: + TaskArgumentAccessor(std::shared_ptr ptr) + : ptr(ptr) {} + std::shared_ptr ptr; +}; + +using DeviceStates = std::variant; + +using TaskImplFunction = std::variant< + std::function, + std::function(TaskArgumentAccessor const &)>>; + +template +TaskImplFunction get_task_impl(); + +template +OpTaskSignature get_signature(); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/tasks.h b/lib/local-execution/include/tasks.h similarity index 95% rename from lib/runtime/src/tasks.h rename to lib/local-execution/include/tasks.h index 0e07fa3f85..d8fdc93f39 100644 --- a/lib/runtime/src/tasks.h +++ b/lib/local-execution/include/tasks.h @@ -1,8 +1,9 @@ -#ifndef _FLEXFLOW_TASKS_H -#define _FLEXFLOW_TASKS_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASKS_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASKS_H -#include "utils/optional.h" #include +#include +#include namespace FlexFlow { @@ -170,9 +171,9 @@ template void register_task(task_id_t, std::string const &name, F 
const &func, - optional cpu_func = nullopt); + std::optional cpu_func = std::nullopt); -template +template void register_task(); void register_tasks(); diff --git a/lib/runtime/src/task_spec/variadic_tensor_ref.h b/lib/local-execution/include/variadic_tensor_ref.h similarity index 72% rename from lib/runtime/src/task_spec/variadic_tensor_ref.h rename to lib/local-execution/include/variadic_tensor_ref.h index a9d1b54731..077c989c95 100644 --- a/lib/runtime/src/task_spec/variadic_tensor_ref.h +++ b/lib/local-execution/include/variadic_tensor_ref.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_VARIADIC_TENSOR_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_VARIADIC_TENSOR_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H #include "arg_ref.h" #include "op_tensor_spec.h" diff --git a/lib/runtime/src/task_spec/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc similarity index 85% rename from lib/runtime/src/task_spec/op_task_invocation.cc rename to lib/local-execution/src/op_task_invocation.cc index fbbfe47726..5342e46b42 100644 --- a/lib/runtime/src/task_spec/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -1,5 +1,4 @@ #include "op_task_invocation.h" -#include "task_argument_accessor.h" namespace FlexFlow { @@ -39,9 +38,4 @@ std::unordered_map, OpTensorSpec> const & return this->tensor_bindings; } -std::unordered_map const & - OpTaskBinding::get_arg_bindings() const { - return this->arg_bindings; -} - } // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc new file mode 100644 index 0000000000..bc3eaa12db --- /dev/null +++ b/lib/local-execution/src/op_task_signature.cc @@ -0,0 +1,81 @@ +#include "op_task_signature.h" + +namespace FlexFlow { + +// OpTaskSignature::OpTaskSignature(OpTaskType t) : type(t){}; + +void OpTaskSignature::add_input_slot(slot_id 
name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_untrainable_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::INPUT, + IsGrad::NO, + OpSlotOptions::UNTRAINABLE}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_untrainable_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::INPUT, + IsGrad::NO, + OpSlotOptions::OPTIONAL_UNTRAINABLE}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_output_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::OUTPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_bwd_necessary_output_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::OUTPUT, + IsGrad::NO, + OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_weight_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::WEIGHT, + IsGrad::NO, + OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_weight_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::WEIGHT, IsGrad::NO, 
OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::set_arg_types( + std::unordered_map const &arg_type) { + this->task_arg_types = arg_type; +} + +void OpTaskSignature::add_from_slot_spec(OpTensorSlotSpec const &spec) { + this->op_tensor_slots.insert(spec); +} + +} // namespace FlexFlow diff --git a/lib/runtime/src/permissions.cc b/lib/local-execution/src/permissions.cc similarity index 67% rename from lib/runtime/src/permissions.cc rename to lib/local-execution/src/permissions.cc index 2992780ae1..2843dd1b70 100644 --- a/lib/runtime/src/permissions.cc +++ b/lib/local-execution/src/permissions.cc @@ -3,36 +3,6 @@ namespace FlexFlow { -Legion::PrivilegeMode to_legion(Permissions p) { - switch (p) { - case Permissions::NONE: - return LEGION_NO_ACCESS; - case Permissions::RO: - return LEGION_READ_ONLY; - case Permissions::WO: - return LEGION_WRITE_ONLY; - case Permissions::RW: - return LEGION_READ_WRITE; - default: - throw mk_runtime_error("Unknown permission {}", static_cast(p)); - } -} - -optional from_legion(Legion::PrivilegeMode p) { - switch (p) { - case LEGION_NO_ACCESS: - return Permissions::NONE; - case LEGION_READ_ONLY: - return Permissions::RO; - case LEGION_WRITE_ONLY: - return Permissions::WO; - case LEGION_READ_WRITE: - return Permissions::RW; - default: - return nullopt; - } -} - Permissions join(Permissions lhs, Permissions rhs) { if (lhs <= rhs) { return rhs; diff --git a/lib/runtime/include/runtime/task_spec/concrete_arg.h b/lib/runtime/include/runtime/task_spec/concrete_arg.h deleted file mode 100644 index 1d973eb81a..0000000000 --- a/lib/runtime/include/runtime/task_spec/concrete_arg.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_INCLUDE_RUNTIME_TASK_SPEC_CONCRETE_ARG_H -#define _FLEXFLOW_RUNTIME_INCLUDE_RUNTIME_TASK_SPEC_CONCRETE_ARG_H - -#include "arg_type_runtime_tag.h" -#include "utils/type_index.h" -#include - -namespace FlexFlow { - -struct ConcreteArgSpec { -public: - 
ConcreteArgSpec() = delete; - - template - T const &get() { - assert(this->type_tag.matches()); - - return *(T const *)ptr.get(); - } - - ArgTypeRuntimeTag get_type_tag() const { - return this->type_tag; - } - size_t serialize(Legion::Serializer &) const; - - template - static ConcreteArgSpec create(T const &t) { - static_assert(is_serializable::value, "Type must be serializable"); - - return ConcreteArgSpec(type_index(), - std::make_shared(t), - ArgTypeRuntimeTag::create()); - } - -private: - ConcreteArgSpec(std::type_index, - std::shared_ptr, - ArgTypeRuntimeTag const &); - - ArgTypeRuntimeTag type_tag; - std::shared_ptr ptr; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h deleted file mode 100644 index 9cc05b8252..0000000000 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ /dev/null @@ -1,193 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H - -#include "accessor.h" -#include "device_specific.h" -#include "realm_allocator.h" -#include "runtime/config.h" -#include "utils/exception.h" -#include "utils/stack_map.h" -#include "utils/strong_typedef.h" -#include - -namespace FlexFlow { - -struct region_idx_t : strong_typedef { - using strong_typedef::strong_typedef; -}; - -FF_TYPEDEF_HASHABLE(region_idx_t); -FF_TYPEDEF_PRINTABLE(region_idx_t, "region_idx"); - -using NonvariadicFormat = region_idx_t; -using VariadicFormat = std::vector; - -using TensorArgumentFormat = variant; - -bool is_variadic(TensorArgumentFormat const &); -VariadicFormat get_variadic_format(TensorArgumentFormat const &); -NonvariadicFormat get_nonvariadic_format(TensorArgumentFormat const &); - -struct TaskArgumentFormat { - std::type_index type; - size_t start; - req end; -}; -FF_VISITABLE_STRUCT(TaskArgumentFormat, type, start, end); - -struct FutureArgumentFormat { - std::type_index type; - req future_idx; 
-}; -FF_VISITABLE_STRUCT(FutureArgumentFormat, type, future_idx); - -struct TaskArgumentsFormat { - TaskArgumentsFormat() = default; - - stack_map region_idxs; - stack_map args; - stack_map futures; - stack_map regions; - stack_map data_types; - - void insert(std::pair const &); - void insert(std::pair const &); - - void insert(region_idx_t, Legion::PrivilegeMode, DataType); - void insert(slot_id, region_idx_t); - void insert(slot_id, std::vector const &); -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - TaskArgumentsFormat, region_idxs, args, futures, regions, data_types); - -Legion::PrivilegeMode get_privileges(TaskArgumentsFormat const &, - region_idx_t const &); -Legion::PrivilegeMode get_privileges(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -Permissions get_permissions(TaskArgumentsFormat const &, region_idx_t const &); -Permissions get_permissions(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -region_idx_t get_region_idx(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &); - -struct TaskArgumentAccessor { - TaskArgumentAccessor(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - Allocator get_allocator() const { - return get_gpu_memory_allocator(this->task); - } - - template - T const &get_argument(slot_id slot) const { - NOT_IMPLEMENTED(); - // TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); - // std::type_index actual_type = arg_fmt.type; - // std::type_index requested_type = {typeid(T)}; - - // if (actual_type != requested_type) { - // throw mk_runtime_error( - // "Type mismatch in argument access (\"{}\" != \"{}\")", - // actual_type.name(), - // requested_type.name()); - // } - - // void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; - // Legion::Deserializer dez(start_ptr, arg_fmt.start); - - // return ff_task_deserialize(dez); - } - - 
template - optional get_optional_argument(slot_id) const { - NOT_IMPLEMENTED(); - } - - template - std::vector get_variadic_argument(slot_id) const { - NOT_IMPLEMENTED(); - } - - template - privilege_mode_to_accessor - get_generic_accessor(region_idx_t const &idx) const { - auto tensor_privs = get_permissions(this->args_fmt, idx); - if (tensor_privs != PRIV) { - throw mk_runtime_error( - "Privilege mismatch while accessing tensor: {} != {}", - tensor_privs, - PRIV); - } - - return helperGetGenericTensorAccessor( - get_datatype(this->args_fmt, idx), - regions[idx.value()], - task->regions[idx.value()], - FID_DATA, - ctx, - runtime); - } - - template - privilege_mode_to_accessor get_tensor(slot_id slot) const { - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - - return this->get_generic_accessor(argument_format); - } - - template - privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - - template - std::vector> - get_variadic_tensor(slot_id slot) const { - std::vector> result; - - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - for (NonvariadicFormat const &argument : argument_format) { - result.push_back(this->get_generic_accessor(argument)); - } - - return result; - } - - template - std::vector> - get_variadic_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - - template - T *unwrap(DeviceSpecific const &arg) const { - return arg.get(this->get_device_idx()); - } - - template - DeviceSpecific create_device_specific(Args &&...args) const { - return DeviceSpecific::create(this->get_device_idx(), - std::forward(args)...); - } - - size_t get_device_idx() const { - NOT_IMPLEMENTED(); - } - -private: - Legion::Task const *task; - std::vector const ®ions; - Legion::Context ctx; - Legion::Runtime *runtime; - TaskArgumentsFormat const &args_fmt; -}; - -} // namespace FlexFlow - -#endif From 905bdd1d1bf350e3da334c8b7eadaf664009f5ee Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 
21 May 2024 19:16:43 -0700 Subject: [PATCH 09/24] Minor build issues --- lib/local-execution/include/runtime_arg_ref.h | 2 +- lib/utils/include/utils/type_index.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/local-execution/include/runtime_arg_ref.h b/lib/local-execution/include/runtime_arg_ref.h index 1493531dc3..078067150a 100644 --- a/lib/local-execution/include/runtime_arg_ref.h +++ b/lib/local-execution/include/runtime_arg_ref.h @@ -26,7 +26,7 @@ RuntimeArgRef> ff_handle() { return {RuntimeArgRefType::FF_HANDLE}; } -RuntimeArgRef> ff_handle() { +RuntimeArgRef> ff_iteration_config() { return {RuntimeArgRefType::FF_ITERATION_CONFIG}; } diff --git a/lib/utils/include/utils/type_index.h b/lib/utils/include/utils/type_index.h index 49e893faa0..134589e0aa 100644 --- a/lib/utils/include/utils/type_index.h +++ b/lib/utils/include/utils/type_index.h @@ -3,17 +3,18 @@ #include "fmt.h" #include +#include namespace FlexFlow { template -std::type_index type_index() { +std::type_index init_type_index() { return std::type_index(typeid(T)); } template bool matches(std::type_index idx) { - return idx == type_index(); + return idx == init_type_index(); } } // namespace FlexFlow From 3a3684e691f2e459d30bbcda4d21eeb60777fc24 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 22 May 2024 06:55:26 -0700 Subject: [PATCH 10/24] Build op task spec --- .../include/kernels/element_unary_kernels.h | 3 - lib/local-execution/include/op_arg_ref.h | 8 +- .../include/op_task_invocation.h | 33 +------- lib/local-execution/include/runtime_arg_ref.h | 14 +--- lib/local-execution/include/tasks.h | 2 +- .../include/variadic_tensor_ref.h | 4 +- lib/local-execution/src/op_arg_ref.cc | 14 ++++ lib/local-execution/src/op_task_invocation.cc | 75 ++++++++++++++----- lib/local-execution/src/ops/reduce.cc | 2 +- lib/local-execution/src/ops/replicate.cc | 4 +- .../src}/runtime_arg_ref.cc | 4 + .../src/variadic_tensor_ref.cc | 9 +++ 
.../include/op-attrs/get_output_shapes.h | 6 +- .../include/op-attrs/ops/element_unary.h | 3 + lib/op-attrs/include/op-attrs/ops/linear.h | 4 + lib/pcg/src/computation_graph.cc | 36 +++++---- 16 files changed, 129 insertions(+), 92 deletions(-) create mode 100644 lib/local-execution/src/op_arg_ref.cc rename lib/{runtime/src/task_spec => local-execution/src}/runtime_arg_ref.cc (72%) create mode 100644 lib/local-execution/src/variadic_tensor_ref.cc diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 17e0048c65..dedfbb01ef 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -9,9 +9,6 @@ namespace FlexFlow { -using ElementUnaryUnifiedAttrs = - std::variant; - struct ElementUnaryPerDeviceState { ffTensorDescriptor_t inputTensor, outputTensor; req actiDesc; diff --git a/lib/local-execution/include/op_arg_ref.h b/lib/local-execution/include/op_arg_ref.h index 02b354b221..577ac7984a 100644 --- a/lib/local-execution/include/op_arg_ref.h +++ b/lib/local-execution/include/op_arg_ref.h @@ -15,13 +15,9 @@ using OpArgRef = ArgRef; using OpArgRefSpec = ArgRefSpec; template -OpArgRef> per_device_op_state() { - return {OpArgRefType::PER_DEVICE_OP_STATE}; -} +OpArgRef> per_device_op_state(); -OpArgRef input_parallel_tensor_shape(int idx) { - return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; -} +OpArgRef input_parallel_tensor_shape(int idx); } // namespace FlexFlow diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h index 2079fabcbc..ba35383641 100644 --- a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H -#define _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #include 
"concrete_arg.h" #include "kernels/accessor.h" @@ -108,34 +108,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskInvocation, OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { - // tensors - auto tensor_bindings = inv.binding.get_tensor_bindings(); - for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - slot_id name = op_tensor_slot_spec.name; - IsGrad is_grad = op_tensor_slot_spec.is_grad; - std::pair tensor_key = std::make_pair(name, is_grad); - OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); - if (op_tensor_spec.role != op_tensor_slot_spec.tensor_role || - op_tensor_spec.slot_option != op_tensor_slot_spec.slot_option) { - return false; - } - } - - // args - auto sig_arg_types = sig.get_arg_types(); - OpArgSpecTypeAccessor type_accessor; - for (auto arg_binding : inv.binding.get_arg_bindings()) { - slot_id name = arg_binding.first; - OpArgSpec op_arg_spec = arg_binding.second; - std::type_index arg_type = sig_arg_types.at(name); - if (type_accessor(op_arg_spec) != arg_type) { - return false; - } - } - - return true; -} +bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv); } // namespace FlexFlow diff --git a/lib/local-execution/include/runtime_arg_ref.h b/lib/local-execution/include/runtime_arg_ref.h index 078067150a..05afa456cf 100644 --- a/lib/local-execution/include/runtime_arg_ref.h +++ b/lib/local-execution/include/runtime_arg_ref.h @@ -18,17 +18,9 @@ using RuntimeArgRef = ArgRef; using RuntimeArgRefSpec = ArgRefSpec; -RuntimeArgRef profiling_settings() { - return {RuntimeArgRefType::PROFILING_SETTINGS}; -} - -RuntimeArgRef> ff_handle() { - return {RuntimeArgRefType::FF_HANDLE}; -} - -RuntimeArgRef> ff_iteration_config() { - return {RuntimeArgRefType::FF_ITERATION_CONFIG}; -} +RuntimeArgRef profiling_settings(); +RuntimeArgRef> ff_handle(); 
+RuntimeArgRef> iteration_config(); } // namespace FlexFlow diff --git a/lib/local-execution/include/tasks.h b/lib/local-execution/include/tasks.h index d8fdc93f39..c78fefd4ea 100644 --- a/lib/local-execution/include/tasks.h +++ b/lib/local-execution/include/tasks.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASKS_H #define _FLEXFLOW_LOCAL_EXECUTION_TASKS_H +#include #include #include -#include namespace FlexFlow { diff --git a/lib/local-execution/include/variadic_tensor_ref.h b/lib/local-execution/include/variadic_tensor_ref.h index 077c989c95..091c55b0af 100644 --- a/lib/local-execution/include/variadic_tensor_ref.h +++ b/lib/local-execution/include/variadic_tensor_ref.h @@ -11,9 +11,7 @@ enum class VariadicTensorRefType { INPUT_TENSORS }; template using VariadicTensorRef = ArgRef; -VariadicTensorRef get_input_tensors() { - return {VariadicTensorRefType::INPUT_TENSORS}; -} +VariadicTensorRef get_input_tensors(); } // namespace FlexFlow diff --git a/lib/local-execution/src/op_arg_ref.cc b/lib/local-execution/src/op_arg_ref.cc new file mode 100644 index 0000000000..ba5d215cb5 --- /dev/null +++ b/lib/local-execution/src/op_arg_ref.cc @@ -0,0 +1,14 @@ +#include "op_arg_ref.h" + +namespace FlexFlow { + +template +OpArgRef> per_device_op_state() { + return {OpArgRefType::PER_DEVICE_OP_STATE}; +} + +OpArgRef input_parallel_tensor_shape(int idx) { + return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; +} + +} \ No newline at end of file diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 5342e46b42..39e023e554 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -2,29 +2,21 @@ namespace FlexFlow { -OpTaskSignature get_signature(task_id_t const &) { - NOT_IMPLEMENTED(); +OpTensorSpec input_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::INPUT, option, idx}; } -OpTensorSpec::OpTensorSpec(TensorRole _role, int 
_idx) - : role(_role), idx(_idx) {} - -OpTensorSpec input_tensor(int idx) { - return {TensorRole::INPUT, idx}; -} - -OpTensorSpec output_tensor(int idx) { - return {TensorRole::OUTPUT, idx}; +OpTensorSpec output_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::OUTPUT, option, idx}; } -OpTensorSpec weight_tensor(int idx) { - return {TensorRole::WEIGHT, idx}; +OpTensorSpec weight_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::WEIGHT, option, idx}; } -// OpTaskBinding::OpTaskBinding() { -// this->serializer.reserve_bytes(sizeof(TaskArgumentFormat)); -// } - void OpTaskBinding::bind(slot_id slot, OpTensorSpec const &tensor_spec) { this->tensor_bindings.insert({{slot, IsGrad::NO}, tensor_spec}); } @@ -38,4 +30,53 @@ std::unordered_map, OpTensorSpec> const & return this->tensor_bindings; } +std::unordered_map const & + OpTaskBinding::get_arg_bindings() const { + return this->arg_bindings; +} + +OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { + OpTaskBinding bwd; + bwd.bind_args_from_fwd(fwd); + bwd.bind_tensors_from_fwd(fwd); + for (auto const &[key, spec] : fwd.get_tensor_bindings()) { + OpSlotOptions slot_option = spec.slot_option; + if (slot_option != OpSlotOptions::UNTRAINABLE || + slot_option != OpSlotOptions::OPTIONAL_UNTRAINABLE) { + slot_id slot = key.first; + bwd.bind_grad(slot, spec); + } + } + return bwd; +} + +bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { + // tensors + auto tensor_bindings = inv.binding.get_tensor_bindings(); + for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { + slot_id name = op_tensor_slot_spec.name; + IsGrad is_grad = op_tensor_slot_spec.is_grad; + std::pair tensor_key = std::make_pair(name, is_grad); + OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); + if (op_tensor_spec.role != op_tensor_slot_spec.tensor_role || + op_tensor_spec.slot_option != 
op_tensor_slot_spec.slot_option) { + return false; + } + } + + // args + auto sig_arg_types = sig.get_arg_types(); + OpArgSpecTypeAccessor type_accessor; + for (auto arg_binding : inv.binding.get_arg_bindings()) { + slot_id name = arg_binding.first; + OpArgSpec op_arg_spec = arg_binding.second; + std::type_index arg_type = sig_arg_types.at(name); + if (type_accessor(op_arg_spec) != arg_type) { + return false; + } + } + + return true; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc index d502a2b669..5228d15a61 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -20,7 +20,7 @@ enum Slots { HANDLE }; -OpTaskInvocation init(TransposeAttrs const &attrs) { +OpTaskInvocation init(ReduceAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(HANDLE, ff_handle()); diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index fa13766d9e..7e8cbac8c1 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -63,8 +63,8 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); return profile(backward_kernel, diff --git a/lib/runtime/src/task_spec/runtime_arg_ref.cc b/lib/local-execution/src/runtime_arg_ref.cc similarity index 72% rename from lib/runtime/src/task_spec/runtime_arg_ref.cc rename to lib/local-execution/src/runtime_arg_ref.cc index a0aa242ce6..a9d573bbb5 100644 --- a/lib/runtime/src/task_spec/runtime_arg_ref.cc +++ b/lib/local-execution/src/runtime_arg_ref.cc @@ -11,4 +11,8 @@ RuntimeArgRef> ff_handle() { return {RuntimeArgRefType::FF_HANDLE}; } 
+RuntimeArgRef> iteration_config() { + return {RuntimeArgRefType::FF_ITERATION_CONFIG}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/local-execution/src/variadic_tensor_ref.cc new file mode 100644 index 0000000000..e524f4d7a5 --- /dev/null +++ b/lib/local-execution/src/variadic_tensor_ref.cc @@ -0,0 +1,9 @@ +#include "variadic_tensor_ref.h" + +namespace FlexFlow { + +VariadicTensorRef get_input_tensors() { + return {VariadicTensorRefType::INPUT_TENSORS}; +} + +} \ No newline at end of file diff --git a/lib/op-attrs/include/op-attrs/get_output_shapes.h b/lib/op-attrs/include/op-attrs/get_output_shapes.h index 6fb93aac91..496cfbb755 100644 --- a/lib/op-attrs/include/op-attrs/get_output_shapes.h +++ b/lib/op-attrs/include/op-attrs/get_output_shapes.h @@ -128,9 +128,7 @@ ParallelTensorShape get_output_shape(DropoutAttrs const &, ParallelTensorShape get_output_shape(ElementBinaryAttrs const &, ParallelTensorShape const &, ParallelTensorShape const &); -ParallelTensorShape get_output_shape(ElementUnaryAttrs const &, - ParallelTensorShape const &); -ParallelTensorShape get_output_shape(ElementScalarUnaryAttrs const &, +ParallelTensorShape get_output_shape(ElementUnaryUnifiedAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(EmbeddingAttrs const &, ParallelTensorShape const &); @@ -153,6 +151,8 @@ ParallelTensorShape get_output_shape(RepartitionAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(ReplicateAttrs const &, ParallelTensorShape const &); +ParallelTensorShape get_output_shape(ReshapeAttrs const &, + ParallelTensorShape const &); ParallelTensorShape get_output_shape(ReverseAttrs const &, ParallelTensorShape const &); std::vector get_output_shapes(SplitAttrs const &, diff --git a/lib/op-attrs/include/op-attrs/ops/element_unary.h b/lib/op-attrs/include/op-attrs/ops/element_unary.h index 5e19b81c8c..6a80094dfa 100644 --- 
a/lib/op-attrs/include/op-attrs/ops/element_unary.h +++ b/lib/op-attrs/include/op-attrs/ops/element_unary.h @@ -21,6 +21,9 @@ struct ElementScalarUnaryAttrs { FF_VISITABLE_STRUCT(ElementScalarUnaryAttrs, op_type, scalar); CHECK_VALID_OP_ATTR(ElementScalarUnaryAttrs); +using ElementUnaryUnifiedAttrs = + std::variant; + } // namespace FlexFlow #endif diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h index a46df59282..3b57a959b8 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear.h +++ b/lib/op-attrs/include/op-attrs/ops/linear.h @@ -36,7 +36,11 @@ CHECK_VALID_OP_ATTR(LinearAttrs); TensorShape get_weights_shape(LinearAttrs const &attrs, TensorShape const &input); +ParallelTensorShape get_weights_shape(LinearAttrs const &attrs, + ParallelTensorShape const &input); TensorShape get_bias_shape(LinearAttrs const &attrs, TensorShape const &input); +ParallelTensorShape get_bias_shape(LinearAttrs const &attrs, + ParallelTensorShape const &input); } // namespace FlexFlow diff --git a/lib/pcg/src/computation_graph.cc b/lib/pcg/src/computation_graph.cc index d8a57311bf..8b7803c181 100644 --- a/lib/pcg/src/computation_graph.cc +++ b/lib/pcg/src/computation_graph.cc @@ -2,14 +2,16 @@ namespace FlexFlow { -std::vector traverse_comp_graph(ComputationGraph const & comp_graph) { +std::vector + traverse_comp_graph(ComputationGraph const &comp_graph) { std::vector layers = get_topological_ordering(comp_graph.value()); return transform(layers, [&](Node const &e) -> operator_guid_t { return operator_guid_t{e}; }); } -std::vector traverse_comp_graph_backwards(ComputationGraph const & comp_graph) { +std::vector + traverse_comp_graph_backwards(ComputationGraph const &comp_graph) { std::vector layers = reversed>(get_topological_ordering(comp_graph.value())); return transform(layers, [&](Node const &e) -> operator_guid_t { @@ -30,27 +32,30 @@ std::vector std::vector sorted_outputs(outputs.begin(), outputs.end()); sort(sorted_outputs.begin(), 
sorted_outputs.end(), src_edge_comparator); return transform(sorted_outputs, - [&](MultiDiOutput const &e) -> tensor_guid_t { - return tensor_guid_t{e}; - }); + [&](MultiDiOutput const &e) -> tensor_guid_t { + return tensor_guid_t{e}; + }); } -std::vector get_outgoing_tensors(ComputationGraph const & comp_graph, - operator_guid_t n) { +std::vector + get_outgoing_tensors(ComputationGraph const &comp_graph, + operator_guid_t n) { return sort_edge_set(get_outgoing_edges(comp_graph.value(), n.value())); } -std::vector get_incoming_tensors(ComputationGraph const & comp_graph, operator_guid_t n) { +std::vector + get_incoming_tensors(ComputationGraph const &comp_graph, + operator_guid_t n) { return sort_edge_set(get_incoming_edges(comp_graph.value(), n.value())); } -operator_guid_t add_node(ComputationGraph & comp_graph, Layer const &layer) { +operator_guid_t add_node(ComputationGraph &comp_graph, Layer const &layer) { Node added_node = comp_graph.value().add_node(layer); return operator_guid_t{added_node}; } -tensor_guid_t create_outgoing_edge_with_label(ComputationGraph & comp_graph, - operator_guid_t node, +tensor_guid_t create_outgoing_edge_with_label(ComputationGraph &comp_graph, + operator_guid_t node, int idx, Tensor tensor) { MultiDiOutput edge = {node.value(), NodePort{idx}}; @@ -58,8 +63,8 @@ tensor_guid_t create_outgoing_edge_with_label(ComputationGraph & comp_graph, return tensor_guid_t{edge}; } -void add_incoming_edges(ComputationGraph & comp_graph, -std::vector const &incoming_edges, +void add_incoming_edges(ComputationGraph &comp_graph, + std::vector const &incoming_edges, operator_guid_t node) { size_t incoming_edge_dst_port = 0; for (tensor_guid_t input : incoming_edges) { @@ -72,8 +77,9 @@ std::vector const &incoming_edges, } } -CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const & comp_graph, operator_guid_t const &n) { +CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const &comp_graph, + operator_guid_t const &n) { return 
comp_graph.at(n).attrs; } -} \ No newline at end of file +} // namespace FlexFlow From a4dd9d4c2ac3a5381b4e60a7b36b45584d6cf7b7 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 22 May 2024 11:44:50 -0700 Subject: [PATCH 11/24] Build ops and op task spec --- .../include/op_task_invocation.h | 14 +----- .../include/op_task_signature.h | 11 ++--- lib/local-execution/include/profiling.h | 4 -- lib/local-execution/src/op_arg_ref.cc | 2 +- lib/local-execution/src/op_task_invocation.cc | 41 +++++++++++----- lib/local-execution/src/op_task_signature.cc | 2 +- lib/local-execution/src/ops/attention.cc | 6 +-- lib/local-execution/src/ops/batch_matmul.cc | 3 +- lib/local-execution/src/ops/batch_norm.cc | 7 ++- lib/local-execution/src/ops/cast.cc | 3 +- lib/local-execution/src/ops/combine.cc | 3 +- lib/local-execution/src/ops/concat.cc | 4 +- lib/local-execution/src/ops/conv_2d.cc | 6 +-- lib/local-execution/src/ops/dropout.cc | 7 ++- lib/local-execution/src/ops/element_binary.cc | 6 +-- lib/local-execution/src/ops/element_unary.cc | 7 ++- lib/local-execution/src/ops/embedding.cc | 3 +- lib/local-execution/src/ops/flat.cc | 3 +- lib/local-execution/src/ops/layer_norm.cc | 24 +++++----- lib/local-execution/src/ops/linear.cc | 6 +-- lib/local-execution/src/ops/partition.cc | 6 +-- lib/local-execution/src/ops/pool_2d.cc | 6 +-- lib/local-execution/src/ops/reduce.cc | 6 +-- lib/local-execution/src/ops/reduction.cc | 3 +- lib/local-execution/src/ops/replicate.cc | 5 +- lib/local-execution/src/ops/reshape.cc | 6 +-- lib/local-execution/src/ops/reverse.cc | 3 +- lib/local-execution/src/ops/softmax.cc | 6 +-- lib/local-execution/src/ops/split.cc | 48 ++++++++++--------- lib/local-execution/src/ops/topk.cc | 6 +-- .../src/variadic_tensor_ref.cc | 2 +- 31 files changed, 113 insertions(+), 146 deletions(-) diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h index ba35383641..03cd19ed8e 100644 --- 
a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -25,14 +25,6 @@ enum class IsTrainable { YES, NO }; using OpArgSpec = std::variant; -struct OpArgSpecTypeAccessor { - std::type_index operator()(OpArgSpec &spec) { - return std::visit( - [](auto &&arg) -> std::type_index { return arg.get_type_index(); }, - spec); - } -}; - struct OpTaskBinding { OpTaskBinding() = default; @@ -101,14 +93,12 @@ struct OpTaskInvocation { task_id_t task_id; OpTaskBinding binding; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskInvocation, - task_id, - binding); +FF_VISITABLE_STRUCT(OpTaskInvocation, task_id, binding); OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv); +bool is_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv); } // namespace FlexFlow diff --git a/lib/local-execution/include/op_task_signature.h b/lib/local-execution/include/op_task_signature.h index 626266d10f..c4553df8a1 100644 --- a/lib/local-execution/include/op_task_signature.h +++ b/lib/local-execution/include/op_task_signature.h @@ -41,8 +41,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( OpTensorSlotSpec, name, slot_type, tensor_role, is_grad, slot_option); struct OpTaskSignature { - OpTaskSignature() = default; - // explicit OpTaskSignature(OpTaskType); + explicit OpTaskSignature(OpTaskType); OpTaskType get_task_type() const { return this->type; @@ -72,7 +71,6 @@ struct OpTaskSignature { template void add_return_value() { - // std::type_index return_value = init_type_index(); this->return_value = init_type_index(); } @@ -92,11 +90,8 @@ struct OpTaskSignature { std::unordered_map task_arg_types; std::unordered_set op_tensor_slots; }; -// FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskSignature, -// type, -// return_value, -// task_arg_types, -// op_tensor_slots); 
+FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + OpTaskSignature, type, return_value, task_arg_types, op_tensor_slots); template void register_task(task_id_t, diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index 066cdc8404..c4ac1b7d02 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -12,10 +12,6 @@ std::optional profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); - // TODO -- local logger? - // if (elapsed.has_value()) { - // log_profile.debug(s, elapsed.value()); - // } return elapsed; } diff --git a/lib/local-execution/src/op_arg_ref.cc b/lib/local-execution/src/op_arg_ref.cc index ba5d215cb5..6bea26a5a2 100644 --- a/lib/local-execution/src/op_arg_ref.cc +++ b/lib/local-execution/src/op_arg_ref.cc @@ -11,4 +11,4 @@ OpArgRef input_parallel_tensor_shape(int idx) { return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; } -} \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 39e023e554..94504840c0 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -50,28 +50,38 @@ OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { return bwd; } -bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { - // tensors +bool is_op_tensor_spec_invalid(OpTensorSlotSpec tensor_slot_spec, + OpTensorSpec tensor_spec) { + return tensor_spec.role != tensor_slot_spec.tensor_role || + tensor_spec.slot_option != tensor_slot_spec.slot_option; +} + +bool is_tensor_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - slot_id name = op_tensor_slot_spec.name; - IsGrad is_grad = 
op_tensor_slot_spec.is_grad; - std::pair tensor_key = std::make_pair(name, is_grad); + std::pair tensor_key = + std::make_pair(op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad); OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); - if (op_tensor_spec.role != op_tensor_slot_spec.tensor_role || - op_tensor_spec.slot_option != op_tensor_slot_spec.slot_option) { + if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { return false; } } + return true; +} - // args +bool is_arg_type_invalid(std::type_index expected_arg_type, + OpArgSpec op_arg_spec) { + std::type_index arg_spec_type = std::visit( + [](auto &&arg) -> std::type_index { return arg.get_type_index(); }, + op_arg_spec); + return arg_spec_type != expected_arg_type; +} + +bool is_arg_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { auto sig_arg_types = sig.get_arg_types(); - OpArgSpecTypeAccessor type_accessor; for (auto arg_binding : inv.binding.get_arg_bindings()) { - slot_id name = arg_binding.first; - OpArgSpec op_arg_spec = arg_binding.second; - std::type_index arg_type = sig_arg_types.at(name); - if (type_accessor(op_arg_spec) != arg_type) { + std::type_index arg_type = sig_arg_types.at(arg_binding.first); + if (is_arg_type_invalid(arg_type, arg_binding.second)) { return false; } } @@ -79,4 +89,9 @@ bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { return true; } +bool is_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { + return is_tensor_invocation_valid(sig, inv) && + is_arg_invocation_valid(sig, inv); +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index bc3eaa12db..71642680a6 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -2,7 +2,7 @@ namespace FlexFlow { -// OpTaskSignature::OpTaskSignature(OpTaskType t) : type(t){}; +OpTaskSignature::OpTaskSignature(OpTaskType t) : 
type(t){}; void OpTaskSignature::add_input_slot(slot_id name, SlotType slot_type) { OpTensorSlotSpec op_tensor_slot_spec = { diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index 854213a955..414b71ec70 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -277,8 +277,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_arg_slot(QUERY_PARALLEL_TENSOR_SHAPE); init.add_arg_slot(KEY_PARALLEL_TENSOR_SHAPE); init.add_arg_slot(VALUE_PARALLEL_TENSOR_SHAPE); @@ -309,8 +308,7 @@ OpTaskSignature get_signature() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(QUERY); fwd.add_input_slot(KEY); diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index c5df564afd..eccbe5a475 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -187,8 +187,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(A_INPUT); fwd.add_input_slot(B_INPUT); diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index dadfab14e0..5e640d70e0 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -189,8 +189,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); init.add_input_slot(BIAS); 
init.add_output_slot(OUTPUT); @@ -211,8 +211,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_input_slot(SCALE); diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 0914ea40a6..5647d7e7f2 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -107,8 +107,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index 942d964021..0bce55722a 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -84,8 +84,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 3d62c19f20..087f08b577 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -111,8 +111,8 @@ CostMetrics template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); + fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUTS, SlotType::VARIADIC); diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index 0df15e9b23..a53b259fac 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -178,8 +178,7 @@ CostMetrics 
measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); init.add_output_slot(OUTPUT); @@ -202,8 +201,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index 236b7e2c88..4935091ee5 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -124,8 +124,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); + init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(FF_HANDLE); init.add_output_slot(OUTPUT); @@ -145,8 +145,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc index 0cec2b8d0a..b5588e04fd 100644 --- a/lib/local-execution/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -173,8 +173,7 @@ CostMetrics template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(LHS_INPUT); init.add_input_slot(RHS_INPUT); @@ -197,8 +196,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); 
fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index 9567fc1570..ddec57414a 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -150,8 +150,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); + init.add_arg_slot(INPUT_SHAPE); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -171,8 +171,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/local-execution/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc index 31dc83814f..bac48c4b24 100644 --- a/lib/local-execution/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -120,8 +120,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_input_slot(OUTPUT); diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 45d3805e0c..9849bd3b73 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -82,8 +82,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index 3caf95c068..83d04b893f 100644 --- 
a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -133,15 +133,16 @@ static DeviceSpecific num_replicas *= input.shape.at(legion_dim_t(i)); effective_num_elements = M; effective_batch_size = input.shape.get_volume() / M; - - DeviceSpecific per_device_state = - init_kernel(handle, - allocator, - attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, - attrs.eps); } + + DeviceSpecific per_device_state = + init_kernel(handle, + allocator, + attrs.elementwise_affine, + effective_batch_size, + effective_num_elements, + attrs.eps); + return per_device_state; } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -186,8 +187,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); @@ -208,8 +208,8 @@ OpTaskSignature bwd_signature() { template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 2d13909c09..08e8fa3f68 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -211,8 +211,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); init.add_weight_slot(WEIGHT); @@ -227,8 +226,7 @@ OpTaskSignature init_signature() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); 
fwd.add_weight_slot(WEIGHT); diff --git a/lib/local-execution/src/ops/partition.cc b/lib/local-execution/src/ops/partition.cc index c6e5bce64d..1d358b52f5 100644 --- a/lib/local-execution/src/ops/partition.cc +++ b/lib/local-execution/src/ops/partition.cc @@ -135,8 +135,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_unchecked_arg_slot(HANDLE); @@ -150,8 +149,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 32bc5d1616..576a5a8d23 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -182,8 +182,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); init.add_output_slot(OUTPUT); @@ -198,8 +197,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc index 5228d15a61..0ccd7be6e3 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -49,8 +49,7 @@ static DeviceSpecific template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); @@ -92,8 +91,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor 
const &acc) { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index 31b3e2458d..86f300df63 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -104,8 +104,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_arg_slot(ATTRS); diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 7e8cbac8c1..3322f8a1ce 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -72,7 +72,7 @@ static std::optional "[replicate] backward_time = %.2lfms\n", input_grad, output_grad, - attrs.replicate_degree); // is this `num_replicas`? 
+ attrs.replicate_degree); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -100,8 +100,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index 2b3200d79d..c53fe5d78b 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -126,8 +126,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_arg_slot(ATTRS); @@ -138,8 +137,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 6c28966e6e..49f1e51076 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -133,8 +133,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 054b3bc7db..5a65127140 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -138,8 +138,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature 
init(OpTaskType::INIT); init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); @@ -150,8 +149,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc index 3661d6e074..ffb40515ad 100644 --- a/lib/local-execution/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -44,17 +44,17 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { return {SPLIT_BWD_TASK_ID, binding}; } -void calc_block_size(coord_t &num_blks, - coord_t &blk_size, +void calc_block_size(coord_t &num_blocks, + coord_t &block_size, ArrayShape const &array_shape, int axis) { - num_blks = 1; - blk_size = 1; + num_blocks = 1; + block_size = 1; for (int d = 0; d < array_shape.num_elements(); d++) { if (d <= axis) { - blk_size *= array_shape.at(legion_dim_t(d)); + block_size *= array_shape.at(legion_dim_t(d)); } else { - num_blks *= array_shape.at(legion_dim_t(d)); + num_blocks *= array_shape.at(legion_dim_t(d)); } } } @@ -65,13 +65,13 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis.value()); + coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; + calc_block_size(num_blocks, in_block_size, input.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blks; + coord_t out_num_blocks; calc_block_size( - out_num_blks, out_blk_size[i], output.shape, attrs.axis.value()); + out_num_blocks, out_block_size[i], output.shape, attrs.axis.value()); } float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, @@ -79,9 +79,9 @@ static 
std::optional forward_task_impl(TaskArgumentAccessor const &acc) { "Split forward_time = %.2lfms\n", &output_float_ptr, input.get_float_ptr(), - out_blk_size, - in_blk_size, - num_blks, + out_block_size, + in_block_size, + num_blocks, attrs.splits.size()); } @@ -93,12 +93,15 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input_grad.shape, attrs.axis.value()); + coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; + calc_block_size( + num_blocks, in_block_size, input_grad.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blks; - calc_block_size( - out_num_blks, out_blk_size[i], output_grad.shape, attrs.axis.value()); + coord_t out_num_blocks; + calc_block_size(out_num_blocks, + out_block_size[i], + output_grad.shape, + attrs.axis.value()); } float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, @@ -106,9 +109,9 @@ static std::optional "Split backward_time = %.2lfms\n", input_grad.get_float_ptr(), &output_grad_ptr, - out_blk_size, - in_blk_size, - num_blks, + out_block_size, + in_block_size, + num_blocks, attrs.splits.size()); } @@ -143,8 +146,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc index 5fb2c6842f..f6783a2d6c 100644 --- a/lib/local-execution/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -157,8 +157,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); 
init.add_arg_slot(ATTRS); // Note: this may have some question init.add_return_value(); @@ -167,8 +166,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_arg_slot(ATTRS); diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/local-execution/src/variadic_tensor_ref.cc index e524f4d7a5..74d0f0d9e7 100644 --- a/lib/local-execution/src/variadic_tensor_ref.cc +++ b/lib/local-execution/src/variadic_tensor_ref.cc @@ -6,4 +6,4 @@ VariadicTensorRef get_input_tensors() { return {VariadicTensorRefType::INPUT_TENSORS}; } -} \ No newline at end of file +} // namespace FlexFlow From 5bc719f893bf65252e8d56f3677f4742b6477e15 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 22 May 2024 12:52:50 -0700 Subject: [PATCH 12/24] Simplify edge set obtain --- lib/pcg/include/pcg/computation_graph.h | 22 +++++----- lib/pcg/src/computation_graph.cc | 55 +++++++++++------------- lib/pcg/src/computation_graph_builder.cc | 19 ++++---- 3 files changed, 46 insertions(+), 50 deletions(-) diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 53aa7eb820..c051fcc8c3 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -36,22 +36,22 @@ struct ComputationGraph CHECK_WELL_BEHAVED_VALUE_TYPE_NO_HASH(ComputationGraph); std::vector - traverse_comp_graph(ComputationGraph const &comp_graph); + traverse_comp_graph_forward(ComputationGraph const &comp_graph); std::vector - traverse_comp_graph_backwards(ComputationGraph const &comp_graph); + traverse_comp_graph_backward(ComputationGraph const &comp_graph); std::vector get_outgoing_tensors(ComputationGraph const &comp_graph, operator_guid_t n); std::vector get_incoming_tensors(ComputationGraph const &comp_graph, operator_guid_t n); -operator_guid_t add_node(ComputationGraph &comp_graph, Layer const &layer); 
-tensor_guid_t create_outgoing_edge_with_label(ComputationGraph &comp_graph, - operator_guid_t node, - int idx, - Tensor tensor); - -void add_incoming_edges(ComputationGraph &comp_graph, - std::vector const &incoming_edges, - operator_guid_t node); +operator_guid_t create_node(ComputationGraph &comp_graph, Layer const &layer); +tensor_guid_t create_outgoing_edge(ComputationGraph &comp_graph, + operator_guid_t node, + int idx, + Tensor tensor); + +void connect_incoming_edges(ComputationGraph &comp_graph, + std::vector const &incoming_edges, + operator_guid_t node); CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const &comp_graph, operator_guid_t const &n); diff --git a/lib/pcg/src/computation_graph.cc b/lib/pcg/src/computation_graph.cc index d8a57311bf..18fded6d3e 100644 --- a/lib/pcg/src/computation_graph.cc +++ b/lib/pcg/src/computation_graph.cc @@ -2,14 +2,16 @@ namespace FlexFlow { -std::vector traverse_comp_graph(ComputationGraph const & comp_graph) { +std::vector + traverse_comp_graph_forward(ComputationGraph const &comp_graph) { std::vector layers = get_topological_ordering(comp_graph.value()); return transform(layers, [&](Node const &e) -> operator_guid_t { return operator_guid_t{e}; }); } -std::vector traverse_comp_graph_backwards(ComputationGraph const & comp_graph) { +std::vector + traverse_comp_graph_backward(ComputationGraph const &comp_graph) { std::vector layers = reversed>(get_topological_ordering(comp_graph.value())); return transform(layers, [&](Node const &e) -> operator_guid_t { @@ -17,50 +19,44 @@ std::vector traverse_comp_graph_backwards(ComputationGraph cons }); } -bool src_edge_comparator(MultiDiOutput x, MultiDiOutput y) { - return x.src_idx < y.src_idx; -} - std::vector sort_edge_set(std::unordered_set edges) { - std::unordered_set outputs = - transform(edges, [&](MultiDiEdge const &e) -> MultiDiOutput { - return MultiDiOutput(e); - }); - std::vector sorted_outputs(outputs.begin(), outputs.end()); - sort(sorted_outputs.begin(), 
sorted_outputs.end(), src_edge_comparator); - return transform(sorted_outputs, - [&](MultiDiOutput const &e) -> tensor_guid_t { - return tensor_guid_t{e}; - }); + return transform( + sorted_by(edges, compare_by([](MultiDiEdge const &e) { + return e.src_idx; + })), + [&](MultiDiEdge const &e) -> tensor_guid_t { return tensor_guid_t{e}; }); } -std::vector get_outgoing_tensors(ComputationGraph const & comp_graph, - operator_guid_t n) { +std::vector + get_outgoing_tensors(ComputationGraph const &comp_graph, + operator_guid_t n) { return sort_edge_set(get_outgoing_edges(comp_graph.value(), n.value())); } -std::vector get_incoming_tensors(ComputationGraph const & comp_graph, operator_guid_t n) { +std::vector + get_incoming_tensors(ComputationGraph const &comp_graph, + operator_guid_t n) { return sort_edge_set(get_incoming_edges(comp_graph.value(), n.value())); } -operator_guid_t add_node(ComputationGraph & comp_graph, Layer const &layer) { +operator_guid_t create_node(ComputationGraph &comp_graph, Layer const &layer) { Node added_node = comp_graph.value().add_node(layer); return operator_guid_t{added_node}; } -tensor_guid_t create_outgoing_edge_with_label(ComputationGraph & comp_graph, - operator_guid_t node, - int idx, - Tensor tensor) { +tensor_guid_t create_outgoing_edge(ComputationGraph &comp_graph, + operator_guid_t node, + int idx, + Tensor tensor) { MultiDiOutput edge = {node.value(), NodePort{idx}}; comp_graph.value().add_output(edge, tensor); return tensor_guid_t{edge}; } -void add_incoming_edges(ComputationGraph & comp_graph, -std::vector const &incoming_edges, - operator_guid_t node) { +void connect_incoming_edges(ComputationGraph &comp_graph, + std::vector const &incoming_edges, + operator_guid_t node) { size_t incoming_edge_dst_port = 0; for (tensor_guid_t input : incoming_edges) { MultiDiOutput input_view = input.value(); @@ -72,8 +68,9 @@ std::vector const &incoming_edges, } } -CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const & comp_graph, 
operator_guid_t const &n) { +CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const &comp_graph, + operator_guid_t const &n) { return comp_graph.at(n).attrs; } -} \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/pcg/src/computation_graph_builder.cc b/lib/pcg/src/computation_graph_builder.cc index 78e49f0695..f237232a76 100644 --- a/lib/pcg/src/computation_graph_builder.cc +++ b/lib/pcg/src/computation_graph_builder.cc @@ -12,13 +12,12 @@ tensor_guid_t ComputationGraphBuilder::add_layer( std::vector>> const &weight_shapes, TensorShape const &output_shape) { - operator_guid_t node = add_node(computation_graph, layer); - add_incoming_edges(computation_graph, inputs, node); - return create_outgoing_edge_with_label( - computation_graph, - node, - 0, - construct_tensor_from_output_shape(output_shape)); + operator_guid_t node = create_node(computation_graph, layer); + connect_incoming_edges(computation_graph, inputs, node); + return create_outgoing_edge(computation_graph, + node, + 0, + construct_tensor_from_output_shape(output_shape)); } std::vector ComputationGraphBuilder::add_layer( @@ -27,11 +26,11 @@ std::vector ComputationGraphBuilder::add_layer( std::vector>> const &weight_shapes, std::vector const &output_shapes) { - operator_guid_t node = add_node(computation_graph, layer); - add_incoming_edges(computation_graph, inputs, node); + operator_guid_t node = create_node(computation_graph, layer); + connect_incoming_edges(computation_graph, inputs, node); std::vector output_tensor_guids; for (int i = 0; i < output_shapes.size(); ++i) { - output_tensor_guids.push_back(create_outgoing_edge_with_label( + output_tensor_guids.push_back(create_outgoing_edge( computation_graph, node, i, From 583b2d30ca780c9ac2e6fbcc391869f33b2ab2b9 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 22 May 2024 13:02:05 -0700 Subject: [PATCH 13/24] Format --- lib/pcg/src/computation_graph.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lib/pcg/src/computation_graph.cc b/lib/pcg/src/computation_graph.cc index 1508ad2d41..18fded6d3e 100644 --- a/lib/pcg/src/computation_graph.cc +++ b/lib/pcg/src/computation_graph.cc @@ -29,7 +29,7 @@ std::vector } std::vector - get_outgoing_tensors(ComputationGraph const &comp_graph, + get_outgoing_tensors(ComputationGraph const &comp_graph, operator_guid_t n) { return sort_edge_set(get_outgoing_edges(comp_graph.value(), n.value())); } From 269557e1fb43a1a5c66dacdbdcc771eef9dd04b4 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Mon, 27 May 2024 22:08:34 -0700 Subject: [PATCH 14/24] Fixes --- .../include/op_task_invocation.h | 9 --------- lib/local-execution/include/op_task_signature.h | 5 +++-- lib/local-execution/src/op_task_invocation.cc | 17 ++++++++++++----- lib/local-execution/src/ops/layer_norm.cc | 13 ++++--------- 4 files changed, 19 insertions(+), 25 deletions(-) diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h index 03cd19ed8e..4c753ec43c 100644 --- a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -8,7 +8,6 @@ #include "op_tensor_spec.h" #include "profiling.h" #include "runtime_arg_ref.h" -#include "serialization.h" #include "tasks.h" #include "utils/bidict.h" #include "utils/stack_map.h" @@ -59,14 +58,6 @@ struct OpTaskBinding { this->insert_arg_spec(name, OpArgRefSpec::create(ref)); } - void bind_args_from_fwd(OpTaskBinding const &fwd) { - this->arg_bindings = fwd.get_arg_bindings(); - } - - void bind_tensors_from_fwd(OpTaskBinding const &fwd) { - this->tensor_bindings = fwd.get_tensor_bindings(); - } - std::unordered_map, OpTensorSpec> const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; diff --git a/lib/local-execution/include/op_task_signature.h b/lib/local-execution/include/op_task_signature.h index c4553df8a1..191c83d287 100644 --- a/lib/local-execution/include/op_task_signature.h +++ 
b/lib/local-execution/include/op_task_signature.h @@ -41,6 +41,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( OpTensorSlotSpec, name, slot_type, tensor_role, is_grad, slot_option); struct OpTaskSignature { + OpTaskSignature() = delete; explicit OpTaskSignature(OpTaskType); OpTaskType get_task_type() const { @@ -81,9 +82,9 @@ struct OpTaskSignature { this->task_arg_types.insert({name, init_type_index()}); } - std::unordered_set get_tensor_slots(); + std::unordered_set get_tensor_slots() const; void set_arg_types(std::unordered_map const &); - std::unordered_map get_arg_types(); + std::unordered_map get_arg_types() const; OpTaskType type; std::optional return_value; diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 94504840c0..31fc0b2da2 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -37,8 +37,8 @@ std::unordered_map const & OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { OpTaskBinding bwd; - bwd.bind_args_from_fwd(fwd); - bwd.bind_tensors_from_fwd(fwd); + bwd.arg_bindings = fwd.get_arg_bindings(); + bwd.tensor_bindings = fwd.get_tensor_bindings(); for (auto const &[key, spec] : fwd.get_tensor_bindings()) { OpSlotOptions slot_option = spec.slot_option; if (slot_option != OpSlotOptions::UNTRAINABLE || @@ -56,7 +56,8 @@ bool is_op_tensor_spec_invalid(OpTensorSlotSpec tensor_slot_spec, tensor_spec.slot_option != tensor_slot_spec.slot_option; } -bool is_tensor_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { +bool is_tensor_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { std::pair tensor_key = @@ -77,7 +78,8 @@ bool is_arg_type_invalid(std::type_index expected_arg_type, return arg_spec_type != expected_arg_type; } -bool 
is_arg_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { +bool is_arg_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { auto sig_arg_types = sig.get_arg_types(); for (auto arg_binding : inv.binding.get_arg_bindings()) { std::type_index arg_type = sig_arg_types.at(arg_binding.first); @@ -89,9 +91,14 @@ bool is_arg_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { return true; } -bool is_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { +bool is_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { return is_tensor_invocation_valid(sig, inv) && is_arg_invocation_valid(sig, inv); } +bool are_sigs_eq(OpTaskSignature const &sig1, OpTaskSignature const &sig2) { + return sig1 == sig2; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index 83d04b893f..fb97f946eb 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -29,13 +29,9 @@ using namespace FlexFlow::Kernels::LayerNorm; enum Slots { PROFILING, INPUT, - INPUT_GRAD, OUTPUT, - OUTPUT_GRAD, GAMMA, - GAMMA_GRAD, BETA, - BETA_GRAD, PER_DEVICE_STATE, ATTRS, HANDLE @@ -95,10 +91,10 @@ static std::optional auto input = acc.get_tensor(INPUT); auto gamma = acc.get_tensor(GAMMA); - auto input_grad = acc.get_tensor(INPUT_GRAD); - auto gamma_grad = acc.get_tensor(GAMMA_GRAD); - auto beta_grad = acc.get_tensor(BETA_GRAD); - auto output_grad = acc.get_tensor(OUTPUT_GRAD); + auto input_grad = acc.get_tensor_grad(INPUT); + auto gamma_grad = acc.get_tensor_grad(GAMMA); + auto beta_grad = acc.get_tensor_grad(BETA); + auto output_grad = acc.get_tensor_grad(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); @@ -170,7 +166,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, fwd_binding.bind_arg(PROFILING, settings); 
fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); - // TODO how to handle gamma and beta, where are they from fwd_binding.bind(GAMMA, input.shape); fwd_binding.bind(BETA, input.shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); From 269770a43214ecddb796210601a3bd4e1b35271e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 29 May 2024 22:24:35 -0700 Subject: [PATCH 15/24] Fix conflicts, some renaming --- lib/local-execution/CMakeLists.txt | 1 + lib/local-execution/include/arg_ref.h | 4 ++-- lib/local-execution/include/concrete_arg.h | 2 +- lib/local-execution/include/local_allocator.h | 5 ----- lib/local-execution/include/op_task_invocation.h | 3 ++- lib/local-execution/include/op_task_signature.h | 6 +++--- lib/local-execution/include/profiling.h | 4 ++++ lib/local-execution/src/op_task_invocation.cc | 4 ---- lib/local-execution/src/tracked_allocator.cc | 15 --------------- lib/utils/include/utils/type_index.h | 4 ++-- 10 files changed, 15 insertions(+), 33 deletions(-) diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt index ee1d8fecdc..6b432fad75 100644 --- a/lib/local-execution/CMakeLists.txt +++ b/lib/local-execution/CMakeLists.txt @@ -12,4 +12,5 @@ ff_add_library( utils kernels pcg + spdlog ) \ No newline at end of file diff --git a/lib/local-execution/include/arg_ref.h b/lib/local-execution/include/arg_ref.h index 67e8a47404..b0e2b57b05 100644 --- a/lib/local-execution/include/arg_ref.h +++ b/lib/local-execution/include/arg_ref.h @@ -42,13 +42,13 @@ struct ArgRefSpec { static ArgRefSpec create(ArgRef const &r) { static_assert(is_serializable::value, "Type must be serializeable"); - return ArgRefSpec(init_type_index(), r.ref_type); + return ArgRefSpec(get_type_index_for_type(), r.ref_type); } template static ArgRefSpec create_device_specific(ArgRef const &r, size_t device_idx) { - return ArgRefSpec(init_type_index(), r.ref_type, device_idx); + return ArgRefSpec(get_type_index_for_type(), 
r.ref_type, device_idx); } private: diff --git a/lib/local-execution/include/concrete_arg.h b/lib/local-execution/include/concrete_arg.h index 522d21485e..072500f47e 100644 --- a/lib/local-execution/include/concrete_arg.h +++ b/lib/local-execution/include/concrete_arg.h @@ -31,7 +31,7 @@ struct ConcreteArgSpec { static ConcreteArgSpec create(T const &t) { static_assert(is_serializable::value, "Type must be serializable"); - std::type_index type_idx = init_type_index(); + std::type_index type_idx = get_type_index_for_type(); std::shared_ptr ptr = std::static_pointer_cast(std::make_shared(t)); diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local_allocator.h index 85e54d09f5..b47220eb8c 100644 --- a/lib/local-execution/include/local_allocator.h +++ b/lib/local-execution/include/local_allocator.h @@ -1,10 +1,5 @@ -<<<<<<< op-refactor #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H -======= -#ifndef _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H -#define _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H ->>>>>>> repo-refactor #include "kernels/allocation.h" #include diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h index 4c753ec43c..1bf94a1b0d 100644 --- a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -89,7 +89,8 @@ FF_VISITABLE_STRUCT(OpTaskInvocation, task_id, binding); OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -bool is_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv); +bool is_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv); } // namespace FlexFlow diff --git a/lib/local-execution/include/op_task_signature.h b/lib/local-execution/include/op_task_signature.h index 191c83d287..840c321627 100644 --- a/lib/local-execution/include/op_task_signature.h 
+++ b/lib/local-execution/include/op_task_signature.h @@ -67,19 +67,19 @@ struct OpTaskSignature { template void add_arg_slot(slot_id name) { static_assert(is_serializable::value, "Type must be serializable"); - this->task_arg_types.insert({name, init_type_index()}); + this->task_arg_types.insert({name, get_type_index_for_type()}); } template void add_return_value() { - this->return_value = init_type_index(); + this->return_value = get_type_index_for_type(); } // adds arg_slot without checking is_serializable, used for arguments that are // deviceSpecific template void add_unchecked_arg_slot(slot_id name) { - this->task_arg_types.insert({name, init_type_index()}); + this->task_arg_types.insert({name, get_type_index_for_type()}); } std::unordered_set get_tensor_slots() const; diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index c4ac1b7d02..f3c0e36cc1 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H #include "kernels/profiling.h" +#include "spdlog/spdlog.h" namespace FlexFlow { @@ -12,6 +13,9 @@ std::optional profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); + if (elapsed.has_value()) { + spdlog::debug(elapsed.value()); + } return elapsed; } diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 31fc0b2da2..5683cb12ec 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -97,8 +97,4 @@ bool is_invocation_valid(OpTaskSignature const &sig, is_arg_invocation_valid(sig, inv); } -bool are_sigs_eq(OpTaskSignature const &sig1, OpTaskSignature const &sig2) { - return sig1 == sig2; -} - } // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc 
b/lib/local-execution/src/tracked_allocator.cc index 6fc2412836..6d06714252 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -3,16 +3,10 @@ namespace FlexFlow { -<<<<<<< op-refactor -void *TrackedAllocator::allocate(size_t requested_memory_size) { - void *ptr; - checkCUDA(cudaMalloc(&ptr, requested_memory_size)); -======= TrackedAllocator::TrackedAllocator(Allocator a) : allocator(a) {} void *TrackedAllocator::allocate(size_t requested_memory_size) { void *ptr = this->allocator.allocate(requested_memory_size); ->>>>>>> repo-refactor this->current_mem_usage += requested_memory_size; return ptr; } @@ -20,11 +14,7 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { void TrackedAllocator::deallocate(void *ptr) { size_t psize; checkCUDA(cudaGetSymbolSize(&psize, ptr)); -<<<<<<< op-refactor - checkCUDA(cudaFree(ptr)); -======= this->allocator.deallocate(ptr); ->>>>>>> repo-refactor this->current_mem_usage -= psize; } @@ -32,13 +22,8 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } -<<<<<<< op-refactor -Allocator get_tracked_memory_allocator() { - return Allocator::create(); -======= Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { return Allocator::create(base_allocator); ->>>>>>> repo-refactor } } // namespace FlexFlow diff --git a/lib/utils/include/utils/type_index.h b/lib/utils/include/utils/type_index.h index 134589e0aa..77a377a48d 100644 --- a/lib/utils/include/utils/type_index.h +++ b/lib/utils/include/utils/type_index.h @@ -8,13 +8,13 @@ namespace FlexFlow { template -std::type_index init_type_index() { +std::type_index get_type_index_for_type() { return std::type_index(typeid(T)); } template bool matches(std::type_index idx) { - return idx == init_type_index(); + return idx == get_type_index_for_type(); } } // namespace FlexFlow From 2fbf2911e9e3bb51e9138231c770594384d86aa3 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar 
Date: Thu, 30 May 2024 15:49:22 -0700 Subject: [PATCH 16/24] Fix gather kernels --- lib/kernels/include/kernels/gather_kernels.h | 22 ++- lib/kernels/src/cuda/ops/gather_kernels.cu | 177 +++++++++---------- lib/local-execution/include/profiling.h | 2 +- 3 files changed, 98 insertions(+), 103 deletions(-) diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index c74f9c0bb6..305ccc8e26 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -2,36 +2,34 @@ #define _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H #include "accessor.h" -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { struct GatherPerDeviceState { + PerDeviceFFHandle handle; int legion_dim; - req index_data_type; }; + FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, - legion_dim, - index_data_type); + handle, + legion_dim); namespace Kernels { namespace Gather { + void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size); + GenericTensorAccessorW const &output); + void backward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size); + GenericTensorAccessorW const &input_grad); + } // namespace Gather } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 37d0112eab..829c952a49 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -25,25 +25,25 @@ template __global__ void gather_forward(float const *input, IndexType const *index, float 
*output, - size_t output_size, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] - // output tensor index: [outer_index, index_2, left_over] + // output tensor index: [outter_index, index_2, left_over] // input tensor shape: [*, input_dim_size, stride] // input tensor stride: [input_dim_size * stride, stride, 1] // the index of the corresponding input tensor should be: - // [outer_index, index[0], left_over] - // Therefore, input_index = outer_index * (stride * input_dim_size) + // [outter_index, index[0], left_over] + // Therefore, input_index = outter_index * (stride * input_dim_size) // + index[0] * stride + left_over; - size_t outer_index = o / (stride * output_dim_size); + coord_t outter_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size - size_t left_over = o % stride; - size_t input_idx = - outer_index * (stride * input_dim_size) + index[o] * stride + left_over; + coord_t left_over = o % stride; + coord_t input_idx = outter_index * (stride * input_dim_size) + + index[o] * stride + left_over; output[o] = input[input_idx]; } } @@ -52,24 +52,24 @@ template __global__ void gather_backward(float const *output_grad, IndexType const *index, float *input_grad, - size_t output_size, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] - // output tensor index: [outer_index, index_2, left_over] + // output tensor index: [outter_index, index_2, left_over] // input tensor shape: [*, input_dim_size, stride] // input 
tensor stride: [input_dim_size * stride, stride, 1] // the index of the corresponding input tensor should be: - // [outer_index, index[0], left_over] - // Therefore, input_index = outer_index * (stride * input_dim_size) + // [outter_index, index[0], left_over] + // Therefore, input_index = outter_index * (stride * input_dim_size) // + index[0] * stride + left_over; - size_t outer_index = o / (stride * output_dim_size); + coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size - size_t left_over = o % stride; - size_t input_idx = + coord_t left_over = o % stride; + coord_t input_idx = outer_index * (stride * input_dim_size) + index[o] * stride + left_over; atomicAdd(&input_grad[input_idx], output_grad[o]); @@ -78,100 +78,97 @@ __global__ void gather_backward(float const *output_grad, template struct ForwardKernel { - void operator()(cudaStream_t stream, - GatherPerDeviceState const &m, + void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - /*size_t stride = 1; - for (int i = 0; i < m->legion_dim; i++) { - stride *= (output.domain.hi()[i] - output.domain.lo()[i] + 1); - } - size_t dim_size = - output.domain.hi()[m->legion_dim] - output.domain.lo()[m->legion_dim] + - 1; -*/ - gather_forward> - <<>>(input.get(), - index.get(), - output.get(), - output.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { + gather_forward<<>>( + input.get_float_ptr(), + index.get(), + output.get_float_ptr(), + output_size, + stride, + input_dim_size, + output_dim_size); } }; -void forward_kernel(cudaStream_t stream, +template +struct BackwardKernel { + void operator()(ffStream_t stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + 
GenericTensorAccessorW const &input_grad, + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { + gather_backward<<>>( + output_grad.get_float_ptr(), + index.get(), + input_grad.get_float_ptr(), + output_size, + stride, + input_dim_size, + output_dim_size); + } +}; + +void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - DataTypeDispatch1{}(m.index_data_type, + GenericTensorAccessorW const &output) { + checkCUDA(get_legion_stream(&stream)); + coord_t stride = 1; + for (int i = 0; i < m.legion_dim; i++) { + stride *= output.shape[legion_dim_t(i)] + 1; + } + + coord_t output_dim_size = output.shape[legion_dim_t(m.legion_dim)] + 1; + coord_t input_dim_size = input.shape[legion_dim_t(m.legion_dim)] + 1; + + assert(index.data_type == DataType::INT32 || + index.data_type == DataType::INT64); + + DataTypeDispatch1{}(index.data_type, stream, - m, input, index, output, + output.shape.get_volume(), stride, input_dim_size, output_dim_size); } -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - GatherPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - /*size_t stride = 1; - for (int i = 0; i < m->legion_dim; i++) { - stride *= (output_grad.domain.hi()[i] - output_grad.domain.lo()[i] + 1); - } - size_t dim_size = output_grad.domain.hi()[m->legion_dim] - - output_grad.domain.lo()[m->legion_dim] + 1; - */ - gather_backward> - <<>>(output_grad.get(), - index.get(), - input_grad.get(), - output_grad.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); - } -}; - -void backward_kernel(cudaStream_t stream, +void 
backward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - DataTypeDispatch1{}(m.index_data_type, + GenericTensorAccessorW const &input_grad) { + checkCUDA(get_legion_stream(&stream)); + coord_t stride = 1; + for (int i = 0; i < m.legion_dim; i++) { + stride *= output_grad.shape[legion_dim_t(i)] + 1; + } + + coord_t output_dim_size = output_grad.shape[legion_dim_t(m.legion_dim)] + 1; + coord_t input_dim_size = input_grad.shape[legion_dim_t(m.legion_dim)] + 1; + + assert(index.data_type == DataType::INT32 || + index.data_type == DataType::INT64); + + DataTypeDispatch1{}(index.data_type, stream, - m, output_grad, index, input_grad, + output_grad.shape.get_volume(), stride, input_dim_size, output_dim_size); diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index f3c0e36cc1..6a3557e5b9 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -14,7 +14,7 @@ std::optional std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); if (elapsed.has_value()) { - spdlog::debug(elapsed.value()); + spdlog::debug("{} kernel execution time: {}", s, elapsed.value()); } return elapsed; } From a2a7e0a6529a95bd38d21584368901d272fc7e86 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 30 May 2024 16:20:34 -0700 Subject: [PATCH 17/24] Finish gather operator --- lib/kernels/src/cuda/ops/gather_kernels.cu | 22 ++ lib/local-execution/include/profiling.h | 2 +- lib/local-execution/src/ops/gather.cc | 225 +++++++++++ lib/local-execution/src/ops/gather.h | 30 ++ lib/op-attrs/include/op-attrs/ops/gather.h | 4 +- lib/runtime/src/ops/gather.cc | 416 --------------------- lib/runtime/src/ops/gather.h | 78 ---- 7 files changed, 280 insertions(+), 497 deletions(-) create mode 100644 
lib/local-execution/src/ops/gather.cc create mode 100644 lib/local-execution/src/ops/gather.h delete mode 100644 lib/runtime/src/ops/gather.cc delete mode 100644 lib/runtime/src/ops/gather.h diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 829c952a49..dad14d89d7 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -124,6 +124,18 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); + + // Reference code for what's below -- not sure if I got the domain/array shape + // stuff right coord_t stride = 1; for (int i = 0; i < m->legion_dim; i++) { + // stride *= (output.domain.hi()[i] - output.domain.lo()[i] + 1); + // } + // coord_t output_dim_size = + // output.domain.hi()[m->legion_dim] - output.domain.lo()[m->legion_dim] + + // 1; + // coord_t input_dim_size = + // input.domain.hi()[m->legion_dim] - input.domain.lo()[m->legion_dim] + + // 1; + coord_t stride = 1; for (int i = 0; i < m.legion_dim; i++) { stride *= output.shape[legion_dim_t(i)] + 1; @@ -152,6 +164,16 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad) { checkCUDA(get_legion_stream(&stream)); + + // Reference code for what's below -- not sure if I got the domain/array shape + // stuff right coord_t stride = 1; for (int i = 0; i < m->legion_dim; i++) { + // stride *= (output_grad.domain.hi()[i] - output_grad.domain.lo()[i] + 1); + // } + // coord_t output_dim_size = output_grad.domain.hi()[m->legion_dim] - + // output_grad.domain.lo()[m->legion_dim] + 1; + // coord_t input_dim_size = input_grad.domain.hi()[m->legion_dim] - + // input_grad.domain.lo()[m->legion_dim] + 1; + coord_t stride = 1; for (int i = 0; i < m.legion_dim; i++) { stride *= output_grad.shape[legion_dim_t(i)] + 1; diff --git 
a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index 6a3557e5b9..24753ba203 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -14,7 +14,7 @@ std::optional std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); if (elapsed.has_value()) { - spdlog::debug("{} kernel execution time: {}", s, elapsed.value()); + spdlog::debug("{}", s, elapsed.value()); } return elapsed; } diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc new file mode 100644 index 0000000000..5f3acff2f2 --- /dev/null +++ b/lib/local-execution/src/ops/gather.cc @@ -0,0 +1,225 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "gather.h" +#include "kernels/gather_kernels.h" +#include "op-attrs/get_output_shapes.h" +#include + +namespace FlexFlow { + +using namespace FlexFlow::Kernels::Gather; + +enum Slots { INPUT, OUTPUT, INDEX, ATTRS, HANDLE, PROFILING, PER_DEVICE_STATE }; + +OpTaskInvocation init(GatherAttrs const &attrs) { + OpTaskBinding binding; + + binding.bind(INPUT, input_tensor(0)); + binding.bind(INDEX, input_tensor(1)); + binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + + return {GATHER_INIT_TASK_ID, binding}; +} + +OpTaskInvocation forward(GatherAttrs const &attrs) { + OpTaskBinding binding; + + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INDEX, weight_tensor(0)); + + return {GATHER_FWD_TASK_ID, binding}; +} + +OpTaskInvocation backward(GatherAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); + + return {GATHER_BWD_TASK_ID, binding}; +} + +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto index = acc.get_tensor(INDEX); + auto output = acc.get_tensor(OUTPUT); + + PerDeviceFFHandle handle = acc.get_argument(HANDLE); + auto const &attrs = acc.get_argument(ATTRS); + int legion_dim = attrs.legion_dim; + + // Reference code for what's below -- not sure if I got the domain/array shape stuff right + // assert(input.domain.get_dim() == index.domain.get_dim()); + // assert(output.domain.get_dim() == index.domain.get_dim()); + // for (int i = 0; i < input.domain.get_dim(); i++) { + // assert(index.domain.hi()[i] == output.domain.hi()[i]); + // assert(index.domain.lo()[i] == output.domain.lo()[i]); + // if (i != m->legion_dim) { + // assert(input.domain.hi()[i] == index.domain.hi()[i]); + // 
assert(input.domain.lo()[i] == index.domain.lo()[i]); + // } + // } + + assert (input.shape.get_dim() == index.shape.get_dim()); + assert (output.shape.get_dim() == index.shape.get_dim()); + + for (int i = 0; i < input.shape.get_dim(); i++) { + assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); + if (i != legion_dim) { + assert(input.shape[legion_dim_t(i)] == index.shape[legion_dim_t(i)]); + } + } + + return DeviceSpecific({handle, legion_dim}); +} + +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + + auto input = acc.get_tensor(INPUT); + auto index = acc.get_tensor(INDEX); + auto output = acc.get_tensor(OUTPUT); + + return profile(forward_kernel, + profiling, + "[Gather] forward_time = %.2lfms\n", + per_device_state, + input, + index, + output); +} + +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto index = acc.get_tensor(INDEX); + auto input_grad = acc.get_tensor_grad(INPUT); + + return profile(forward_kernel, + profiling, + "[Gather] forward_time = %.2lfms\n", + per_device_state, + output_grad, + index, + input_grad); +} + +CostMetrics measure_operator_cost(SimEnvFactory const &sim, + GatherAttrs const &attrs, + InputParallelTensorDesc const &input_shape, + InputParallelTensorDesc const &index_shape, + ProfilingSettings const &settings, + MachineView const &mv) { + + auto env = sim.new_environment(); + + std::vector output_shape = + get_output_shapes(attrs, input_shape.shape, index_shape.shape); + + SimTaskBinding fwd_binding; + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(ATTRS, attrs); + + fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(OUTPUT, 
output_shape); + fwd_binding.bind(INDEX, index_shape); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(GATHER_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(GATHER_BWD_TASK_ID, bwd_binding); + + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); +} + +template <> +OpTaskSignature init_signature() { + OpTaskSignature init(OpTaskType::INIT); + + init.add_input_slot(INPUT); + init.add_input_slot(INDEX); + init.add_output_slot(OUTPUT); + + init.add_arg_slot(ATTRS); + init.add_unchecked_arg_slot(HANDLE); + + init.add_return_value(); + + return init; +} + +template <> +void register_task() { + register_task(GATHER_INIT_TASK_ID, + "Gather Init", + init_signature(), + init_task_impl); +} + +template <> +OpTaskSignature fwd_signature() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(ATTRS); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + fwd.add_weight_slot(INDEX); + + return fwd; +} + +template <> +void register_task() { + register_task(GATHER_FWD_TASK_ID, + "Gather Fwd", + fwd_signature(), + forward_task_impl); +} + +template <> +OpTaskSignature bwd_signature() { + OpTaskSignature bwd = + infer_bwd_signature(fwd_signature()); + + return bwd; +} + +template <> +void register_task() { + register_task(GATHER_BWD_TASK_ID, + "Gather Bwd", + bwd_signature(), + backward_task_impl); +} + +}; // namespace FlexFlow diff --git a/lib/local-execution/src/ops/gather.h b/lib/local-execution/src/ops/gather.h new file mode 100644 index 0000000000..e83f768cb7 --- /dev/null +++ b/lib/local-execution/src/ops/gather.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_GATHER_H +#define _FLEXFLOW_GATHER_H + +#include "op-attrs/ops/gather.h" +#include 
"op_task_invocation.h" +#include "sim_environment.h" + +namespace FlexFlow { + +template <> +void register_task(); +template <> +void register_task(); +template <> +void register_task(); + +OpTaskInvocation init(GatherAttrs const &); +OpTaskInvocation forward(GatherAttrs const &); +OpTaskInvocation backward(GatherAttrs const &); + +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + GatherAttrs const &attrs, + InputParallelTensorDesc const &input, + InputParallelTensorDesc const &index, + ProfilingSettings const &settings, + MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h index ca2406ef75..70dd65712a 100644 --- a/lib/op-attrs/include/op-attrs/ops/gather.h +++ b/lib/op-attrs/include/op-attrs/ops/gather.h @@ -9,9 +9,9 @@ namespace FlexFlow { struct GatherAttrs { - ff_dim_t dim; + req legion_dim; }; -FF_VISITABLE_STRUCT(GatherAttrs, dim); +FF_VISITABLE_STRUCT(GatherAttrs, legion_dim); CHECK_VALID_OP_ATTR(GatherAttrs); } // namespace FlexFlow diff --git a/lib/runtime/src/ops/gather.cc b/lib/runtime/src/ops/gather.cc deleted file mode 100644 index 9ef53ffc6a..0000000000 --- a/lib/runtime/src/ops/gather.cc +++ /dev/null @@ -1,416 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gather.h" -#include "embedding.h" -#include "kernels/gather_kernels.h" -#include "legion/legion_utilities.h" - -namespace FlexFlow { - -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; - -using namespace FlexFlow::Kernels::Gather; - -GatherParams Gather::get_params() const { - GatherParams params; - params.legion_dim = this->legion_dim; - params.layer_guid = this->layer_guid; - return params; -} - -Tensor FFModel::gather(const Tensor input, - const Tensor index, - int dim, - char const *name) { - Layer *gather = new Layer(this, - OP_GATHER, - DT_FLOAT, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*output*/, - input, - index); - assert(index->data_type == DT_INT32 || index->data_type == DT_INT64); - assert(input->num_dims == index->num_dims); - int legion_dim = input->num_dims - 1 - dim; - // https://pytorch.org/docs/stable/generated/torch.gather.html - // Currently we assume index.size(d) == input.size(d) for all - // dimensions d != dim, which is a stronger constraint that PyTorch's - for (int i = 0; i < input->num_dims; i++) { - if (i != legion_dim) { - assert(input->dims[i] == index->dims[i]); - } - } - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < index->num_dims; i++) { - dims[i] = index->dims[i]; - } - gather->outputs[0] = create_tensor_legion_ordering( - index->num_dims, dims, input->data_type, gather, 0, true /*create_grad*/); - gather->add_int_property("legion_dim", legion_dim); - layers.push_back(gather); - return gather->outputs[0]; -} - -Op *Gather::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - 
layer->get_int_property("legion_dim", value); - int legion_dim = value; - return new Gather( - model, layer->layer_guid, inputs[0], inputs[1], legion_dim, layer->name); -} - -Gather::Gather(FFModel &model, - GatherParams const ¶ms, - std::pair const &inputs, - char const *name) - : Gather(model, - params.layer_guid, - inputs.first, - inputs.second, - params.legion_dim, - name) {} - -Gather::Gather(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor input, - const ParallelTensor index, - int _legion_dim, - char const *name) - : Op(model, - OP_GATHER, - input->data_type, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - index), - legion_dim(_legion_dim) { - layer_guid = _layer_guid; - // Assume that input and index have the same paralleldim except - // for the legion_dim-th dim, which cannot be parallelized - for (int i = 0; i < input->num_dims; i++) { - if (i != legion_dim) { - assert(input->dims[i] == index->dims[i]); - } - } - assert(index->dims[legion_dim].degree == 1); - assert(input->dims[legion_dim].degree == 1); - // output has the same parallel dims as index - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < index->num_dims; i++) { - dims[i] = index->dims[i]; - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - index->num_dims, dims, input->data_type, this); -} - -void Gather::serialize(Legion::Serializer &sez) const { - GatherParams params = get_params(); - sez.serialize(params.legion_dim); - sez.serialize(this->layer_guid.id); -} - -using PCG::Node; -/*static*/ -Node Gather::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 2); - int legion_dim; - dez.deserialize(legion_dim); - size_t id; - dez.deserialize(id); - LayerID layer_guid(id); - - GatherParams params; - params.legion_dim = legion_dim; - params.layer_guid = layer_guid; - return ff.get_or_create_node({inputs[0], inputs[1]}, params); -} - -Op *Gather::materialize(FFModel 
&ff, - ParallelTensor inputs[], - int num_inputs) const { - GatherParams params = get_params(); - return new Gather(ff, params, {inputs[0], inputs[1]}, this->name); -} - -void Gather::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(GATHER_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Gather)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -PerDeviceOpState *Gather::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - Gather const *gather = (Gather const *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - GatherMeta *m = new GatherMeta(handle, gather); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = 
helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - assert(input.domain.get_dim() == index.domain.get_dim()); - assert(output.domain.get_dim() == index.domain.get_dim()); - for (int i = 0; i < input.domain.get_dim(); i++) { - assert(index.domain.hi()[i] == output.domain.hi()[i]); - assert(index.domain.lo()[i] == output.domain.lo()[i]); - if (i != m->legion_dim) { - assert(input.domain.hi()[i] == index.domain.hi()[i]); - assert(input.domain.lo()[i] == index.domain.lo()[i]); - } - } - return m; -} - -void Gather::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(GATHER_FWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, false), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} - -void Gather::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - GatherMeta const *m = *((GatherMeta **)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], 
regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, index, output); -} - -void Gather::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); - IndexLauncher launcher(GATHER_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} - -void Gather::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - GatherMeta const *m = *((GatherMeta **)task->local_args); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - backward_kernel_wrapper(m, output_grad, index, input_grad); -} - -bool 
Gather::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_input, sub_index, sub_output; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - if (!inputs[1]->get_sub_tensor(mv, sub_index)) { - return false; - } - GatherMeta *m = new GatherMeta(sim->handler, this); - sim->free_all(); - bool out_of_memory = false; - Domain input_domain = sub_input.get_domain(); - void *input_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW input_acc( - inputs[0]->data_type, input_domain, input_ptr); - Domain index_domain = sub_index.get_domain(); - void *index_ptr = sim->allocate(sub_index.get_volume(), inputs[1]->data_type); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW index_acc( - inputs[1]->data_type, index_domain, index_ptr); - out_of_memory = out_of_memory || (input_ptr == NULL) || (index_ptr == NULL); - Domain out_domain = sub_output.get_domain(); - void *output_ptr = - sim->allocate(sub_output.get_volume(), outputs[0]->data_type); - out_of_memory = out_of_memory || (output_ptr == NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW output_acc( - outputs[0]->data_type, out_domain, output_ptr); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(m, input_acc, index_acc, output_acc); - }; - if (sim->computationMode == COMP_MODE_TRAINING) { - backward = [&] { - backward_kernel_wrapper(m, output_acc, index_acc, input_acc); - }; - } - - inner_measure_operator_cost(sim, forward, backward, 
cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - printf("[Measure Gather] name(%s) forward_time(%.4lf) " - "backward_time(%.4lf)\n", - name, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Gather] name(%s) forward_time(%.4lf)\n", - name, - cost_metrics.forward_time); - } - delete m; - return true; -} - -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::GatherParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.legion_dim); - hash_combine(key, params.layer_guid.id); - return key; -} -}; // namespace std diff --git a/lib/runtime/src/ops/gather.h b/lib/runtime/src/ops/gather.h deleted file mode 100644 index 1ea20b71f5..0000000000 --- a/lib/runtime/src/ops/gather.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef _FLEXFLOW_OPS_GATHER_H -#define _FLEXFLOW_OPS_GATHER_H - -#include "op-attrs/ops/gather.h" -#include "op_task_invocation.h" -#include "sim_environment.h" - -namespace FlexFlow { - -template <> -void register_task(); -template <> -void register_task(); -template <> -void register_task(); - -OpTaskInvocation init(GatherAttrs const &); -OpTaskInvocation forward(GatherAttrs const &); -OpTaskInvocation backward(GatherAttrs const &); - -CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, - GatherAttrs const &attrs, - ParallelTensorShape const &input_shape, - ParallelTensorShape const &index_shape, - ProfilingSettings const &settings, - MachineView const &machine_view); - -/* class Gather : public Op { */ -/* public: */ -/* Gather(FFModel &model, */ -/* ParallelTensor const &input, */ -/* ParallelTensor const &index, */ -/* int legion_dim, */ -/* char const *name = nullptr); */ -/* void init(FFModel const &) override; */ -/* void forward(FFModel const &) override; */ -/* void backward(FFModel const &) override; */ - -/* static Op * */ -/* create_operator_from_layer(FFModel &model, */ -/* Layer const *layer, */ -/* std::vector const &inputs); - */ - 
-/* static PerDeviceOpState *init_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void forward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void backward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* bool measure_operator_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const override; */ -/* void serialize(Legion::Serializer &s) const override; */ -/* /1* static PCG::Node deserialize(FFModel &ff, *1/ */ -/* /1* Legion::Deserializer &d, *1/ */ -/* /1* ParallelTensor inputs[], *1/ */ -/* /1* int num_inputs); *1/ */ -/* Op *materialize(FFModel &ff, */ -/* ParallelTensor inputs[], */ -/* int num_inputs) const override; */ - -/* public: */ -/* int legion_dim; */ -/* }; */ - -} // namespace FlexFlow - -#endif From e0b259cdd75d250f8f44e8e3a6477d7438036776 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 30 May 2024 16:26:32 -0700 Subject: [PATCH 18/24] Format --- lib/local-execution/src/ops/gather.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index 5f3acff2f2..566fef2785 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -57,7 +57,8 @@ OpTaskInvocation backward(GatherAttrs const &attrs) { return {GATHER_BWD_TASK_ID, binding}; } -static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto index = acc.get_tensor(INDEX); auto output = acc.get_tensor(OUTPUT); @@ -66,8 +67,8 @@ static DeviceSpecific init_task_impl(TaskArgumentAccessor auto const &attrs = 
acc.get_argument(ATTRS); int legion_dim = attrs.legion_dim; - // Reference code for what's below -- not sure if I got the domain/array shape stuff right - // assert(input.domain.get_dim() == index.domain.get_dim()); + // Reference code for what's below -- not sure if I got the domain/array shape + // stuff right assert(input.domain.get_dim() == index.domain.get_dim()); // assert(output.domain.get_dim() == index.domain.get_dim()); // for (int i = 0; i < input.domain.get_dim(); i++) { // assert(index.domain.hi()[i] == output.domain.hi()[i]); @@ -78,8 +79,8 @@ static DeviceSpecific init_task_impl(TaskArgumentAccessor // } // } - assert (input.shape.get_dim() == index.shape.get_dim()); - assert (output.shape.get_dim() == index.shape.get_dim()); + assert(input.shape.get_dim() == index.shape.get_dim()); + assert(output.shape.get_dim() == index.shape.get_dim()); for (int i = 0; i < input.shape.get_dim(); i++) { assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); From 55971f26764bcbb1261bdcd5772cf7ebc5a3bcc6 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 30 May 2024 20:11:42 -0700 Subject: [PATCH 19/24] Fix substitutions --- lib/substitutions/src/operator_attributes.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/substitutions/src/operator_attributes.cc b/lib/substitutions/src/operator_attributes.cc index 8bd8688194..be5d63024e 100644 --- a/lib/substitutions/src/operator_attributes.cc +++ b/lib/substitutions/src/operator_attributes.cc @@ -129,7 +129,7 @@ std::optional get_attribute(GatherAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::AXIS: - return p.dim; + return p.legion_dim; default: return std::nullopt; } From da38f0a0901e18aefc56a28446a2ae90399e996e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Fri, 31 May 2024 18:57:59 -0700 Subject: [PATCH 20/24] Fix legion dim in gather --- lib/kernels/include/kernels/array_shape.h | 2 +- lib/kernels/include/kernels/gather_kernels.h 
| 2 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 64 ++++++------------- .../include}/legion_tensor_shape.h | 4 +- lib/local-execution/include/profiling.h | 2 +- lib/local-execution/src/ops/gather.cc | 17 +---- lib/op-attrs/include/op-attrs/ops/gather.h | 4 +- lib/substitutions/src/operator_attributes.cc | 2 +- 8 files changed, 32 insertions(+), 65 deletions(-) rename lib/{runtime/src => local-execution/include}/legion_tensor_shape.h (92%) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 36796bc504..15f14f8757 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -42,7 +42,7 @@ struct ArrayShape { ArrayShape reversed_dim_order() const; ArrayShape sub_shape(std::optional start, - std::optional end); + std::optional end) const; public: LegionTensorDims dims; diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 305ccc8e26..13bf4b898a 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -8,7 +8,7 @@ namespace FlexFlow { struct GatherPerDeviceState { PerDeviceFFHandle handle; - int legion_dim; + legion_dim_t legion_dim; }; FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index dad14d89d7..286acf7376 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -32,18 +32,18 @@ __global__ void gather_forward(float const *input, CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] - // output tensor index: [outter_index, index_2, left_over] + // output tensor index: [outer_index, index_2, left_over] // input tensor shape: [*, input_dim_size, stride] // input tensor stride: [input_dim_size * 
stride, stride, 1] // the index of the corresponding input tensor should be: - // [outter_index, index[0], left_over] - // Therefore, input_index = outter_index * (stride * input_dim_size) + // [outer_index, index[0], left_over] + // Therefore, input_index = outer_index * (stride * input_dim_size) // + index[0] * stride + left_over; - coord_t outter_index = o / (stride * output_dim_size); + coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size coord_t left_over = o % stride; - coord_t input_idx = outter_index * (stride * input_dim_size) + - index[o] * stride + left_over; + coord_t input_idx = + outer_index * (stride * input_dim_size) + index[o] * stride + left_over; output[o] = input[input_idx]; } } @@ -59,12 +59,12 @@ __global__ void gather_backward(float const *output_grad, CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] - // output tensor index: [outter_index, index_2, left_over] + // output tensor index: [outer_index, index_2, left_over] // input tensor shape: [*, input_dim_size, stride] // input tensor stride: [input_dim_size * stride, stride, 1] // the index of the corresponding input tensor should be: - // [outter_index, index[0], left_over] - // Therefore, input_index = outter_index * (stride * input_dim_size) + // [outer_index, index[0], left_over] + // Therefore, input_index = outer_index * (stride * input_dim_size) // + index[0] * stride + left_over; coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size @@ -125,24 +125,12 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - // Reference code for what's below -- not sure if I got the domain/array shape - // stuff right coord_t stride = 1; for (int i = 0; i < m->legion_dim; i++) { - // stride *= (output.domain.hi()[i] - output.domain.lo()[i] + 1); 
- // } - // coord_t output_dim_size = - // output.domain.hi()[m->legion_dim] - output.domain.lo()[m->legion_dim] + - // 1; - // coord_t input_dim_size = - // input.domain.hi()[m->legion_dim] - input.domain.lo()[m->legion_dim] + - // 1; - - coord_t stride = 1; - for (int i = 0; i < m.legion_dim; i++) { - stride *= output.shape[legion_dim_t(i)] + 1; - } - - coord_t output_dim_size = output.shape[legion_dim_t(m.legion_dim)] + 1; - coord_t input_dim_size = input.shape[legion_dim_t(m.legion_dim)] + 1; + coord_t stride = + output.shape + .sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1}) + .get_volume(); + coord_t output_dim_size = output.shape[m.legion_dim]; + coord_t input_dim_size = input.shape[m.legion_dim]; assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); @@ -165,22 +153,12 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &input_grad) { checkCUDA(get_legion_stream(&stream)); - // Reference code for what's below -- not sure if I got the domain/array shape - // stuff right coord_t stride = 1; for (int i = 0; i < m->legion_dim; i++) { - // stride *= (output_grad.domain.hi()[i] - output_grad.domain.lo()[i] + 1); - // } - // coord_t output_dim_size = output_grad.domain.hi()[m->legion_dim] - - // output_grad.domain.lo()[m->legion_dim] + 1; - // coord_t input_dim_size = input_grad.domain.hi()[m->legion_dim] - - // input_grad.domain.lo()[m->legion_dim] + 1; - - coord_t stride = 1; - for (int i = 0; i < m.legion_dim; i++) { - stride *= output_grad.shape[legion_dim_t(i)] + 1; - } - - coord_t output_dim_size = output_grad.shape[legion_dim_t(m.legion_dim)] + 1; - coord_t input_dim_size = input_grad.shape[legion_dim_t(m.legion_dim)] + 1; + coord_t stride = + output_grad.shape + .sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1}) + .get_volume(); + coord_t output_dim_size = output_grad.shape[m.legion_dim]; + coord_t input_dim_size = input_grad.shape[m.legion_dim]; assert(index.data_type == 
DataType::INT32 || index.data_type == DataType::INT64); diff --git a/lib/runtime/src/legion_tensor_shape.h b/lib/local-execution/include/legion_tensor_shape.h similarity index 92% rename from lib/runtime/src/legion_tensor_shape.h rename to lib/local-execution/include/legion_tensor_shape.h index 1f5fab76a6..ff96ba9a15 100644 --- a/lib/runtime/src/legion_tensor_shape.h +++ b/lib/local-execution/include/legion_tensor_shape.h @@ -28,8 +28,8 @@ struct LegionTensorShape : public use_visitable_cmp, DataType data_type; }; -ff_dim_t to_ff(legion_dim_t, int num_dims); -legion_dim_t to_legion(ff_dim_t, int num_dims); +ff_dim_t to_ff(legion_dim_t, size_t num_dims); +legion_dim_t to_legion(ff_dim_t, size_t num_dims); ff_dim_t to_ff(legion_dim_t, TensorShape const &); legion_dim_t to_legion(ff_dim_t, TensorShape const &); diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index 24753ba203..bd50801fc4 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -14,7 +14,7 @@ std::optional std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); if (elapsed.has_value()) { - spdlog::debug("{}", s, elapsed.value()); + spdlog::debug(s, elapsed.value()); } return elapsed; } diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index 566fef2785..0f53348cbe 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -15,6 +15,7 @@ #include "gather.h" #include "kernels/gather_kernels.h" +#include "legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" #include @@ -65,26 +66,14 @@ static DeviceSpecific PerDeviceFFHandle handle = acc.get_argument(HANDLE); auto const &attrs = acc.get_argument(ATTRS); - int legion_dim = attrs.legion_dim; - - // Reference code for what's below -- not sure if I got the domain/array shape - // stuff right assert(input.domain.get_dim() == index.domain.get_dim()); - // 
assert(output.domain.get_dim() == index.domain.get_dim()); - // for (int i = 0; i < input.domain.get_dim(); i++) { - // assert(index.domain.hi()[i] == output.domain.hi()[i]); - // assert(index.domain.lo()[i] == output.domain.lo()[i]); - // if (i != m->legion_dim) { - // assert(input.domain.hi()[i] == index.domain.hi()[i]); - // assert(input.domain.lo()[i] == index.domain.lo()[i]); - // } - // } + legion_dim_t legion_dim = to_legion(attrs.ff_dim, input.shape.num_dims()); assert(input.shape.get_dim() == index.shape.get_dim()); assert(output.shape.get_dim() == index.shape.get_dim()); for (int i = 0; i < input.shape.get_dim(); i++) { assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); - if (i != legion_dim) { + if (i != legion_dim.value()) { assert(input.shape[legion_dim_t(i)] == index.shape[legion_dim_t(i)]); } } diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h index 70dd65712a..85a732e3d6 100644 --- a/lib/op-attrs/include/op-attrs/ops/gather.h +++ b/lib/op-attrs/include/op-attrs/ops/gather.h @@ -9,9 +9,9 @@ namespace FlexFlow { struct GatherAttrs { - req legion_dim; + req ff_dim; }; -FF_VISITABLE_STRUCT(GatherAttrs, legion_dim); +FF_VISITABLE_STRUCT(GatherAttrs, ff_dim); CHECK_VALID_OP_ATTR(GatherAttrs); } // namespace FlexFlow diff --git a/lib/substitutions/src/operator_attributes.cc b/lib/substitutions/src/operator_attributes.cc index be5d63024e..2c4cdfbcd8 100644 --- a/lib/substitutions/src/operator_attributes.cc +++ b/lib/substitutions/src/operator_attributes.cc @@ -129,7 +129,7 @@ std::optional get_attribute(GatherAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::AXIS: - return p.legion_dim; + return p.ff_dim; default: return std::nullopt; } From 5f539a3a1edcf72bdb1ec58b099fd0053f0789a0 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 1 Jun 2024 16:00:38 -0700 Subject: [PATCH 21/24] Format string fixes --- lib/local-execution/src/ops/attention.cc | 
4 ++-- lib/local-execution/src/ops/batch_matmul.cc | 4 ++-- lib/local-execution/src/ops/batch_norm.cc | 4 ++-- lib/local-execution/src/ops/cast.cc | 4 ++-- lib/local-execution/src/ops/combine.cc | 4 ++-- lib/local-execution/src/ops/concat.cc | 4 ++-- lib/local-execution/src/ops/conv_2d.cc | 4 ++-- lib/local-execution/src/ops/dropout.cc | 4 ++-- lib/local-execution/src/ops/element_binary.cc | 4 ++-- lib/local-execution/src/ops/element_unary.cc | 4 ++-- lib/local-execution/src/ops/embedding.cc | 4 ++-- lib/local-execution/src/ops/flat.cc | 4 ++-- lib/local-execution/src/ops/gather.cc | 6 +++--- lib/local-execution/src/ops/layer_norm.cc | 4 ++-- lib/local-execution/src/ops/linear.cc | 4 ++-- lib/local-execution/src/ops/partition.cc | 4 ++-- lib/local-execution/src/ops/pool_2d.cc | 4 ++-- lib/local-execution/src/ops/reduce.cc | 4 ++-- lib/local-execution/src/ops/reduction.cc | 4 ++-- lib/local-execution/src/ops/replicate.cc | 4 ++-- lib/local-execution/src/ops/reshape.cc | 4 ++-- lib/local-execution/src/ops/reverse.cc | 4 ++-- lib/local-execution/src/ops/softmax.cc | 4 ++-- lib/local-execution/src/ops/split.cc | 4 ++-- lib/local-execution/src/ops/topk.cc | 4 ++-- lib/local-execution/src/ops/transpose.cc | 4 ++-- lib/op-attrs/include/op-attrs/ops/gather.h | 4 ++-- lib/substitutions/src/operator_attributes.cc | 2 +- 28 files changed, 56 insertions(+), 56 deletions(-) diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index 414b71ec70..3f11829d2f 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -168,7 +168,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[MultiHeadAttention] forward_time = %.2lfms\n", + "[MultiHeadAttention] forward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), key.get_float_ptr(), @@ -207,7 +207,7 @@ static std::optional return profile(backward_kernel, profiling, - 
"[MultiHeadAttention] backward_time = %.2lfms\n", + "[MultiHeadAttention] backward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), query_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index eccbe5a475..76bc88eae6 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -85,7 +85,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[BatchMatmul] forward_time = %.2lfms\n", + "[BatchMatmul] forward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), a_input.get_float_ptr(), @@ -138,7 +138,7 @@ static std::optional return profile(backward_kernel, profiling, - "[BatchMatmul] backward_time = %.2lfms\n", + "[BatchMatmul] backward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), output_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 5e640d70e0..97830f90fe 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -106,7 +106,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[BatchNorm] forward_time = %.2lfms\n", + "[BatchNorm] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -130,7 +130,7 @@ static std::optional return profile(backward_kernel, profiling, - "[BatchNorm] backward_time = %.2lfms\n", + "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 5647d7e7f2..7a74c8824d 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -52,7 +52,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return 
profile(forward_kernel, profiling, - "[Cast] forward_time = %.2lfms\n", + "[Cast] forward_time = {:.2lf}ms\n", input, output, input.data_type, @@ -71,7 +71,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Cast] forward_time = %.2lfms\n", + "[Cast] forward_time = {:.2lf}ms\n", input_grad, output_grad, input.data_type, diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index 0bce55722a..a39a503333 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -50,7 +50,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Combine] forward_time = %.2lfms\n", + "[Combine] forward_time = {:.2lf}ms\n", input, output); } @@ -64,7 +64,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Combine] forward_time = %.2lfms\n", + "[Combine] forward_time = {:.2lf}ms\n", input_grad, output_grad); } diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 087f08b577..3cbc232fac 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -54,7 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Concat] forward_time = %.2lfms\n", + "[Concat] forward_time = {:.2lf}ms\n", output, inputs, attrs.axis); @@ -72,7 +72,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Concat] backward_time = %.2lfms\n", + "[Concat] backward_time = {:.2lf}ms\n", output_grad, input_grads, attrs.axis); diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index a53b259fac..eef4c21a45 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -92,7 +92,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, 
profiling, - "[Conv2d] forward_time = %.2lfms\n", + "[Conv2d] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -119,7 +119,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Conv2d] backward_time = %.2lfms\n", + "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), input_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index 4935091ee5..3db1e7b8eb 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -61,7 +61,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Dropout] forward_time = %.2lfms\n", + "[Dropout] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr()); @@ -79,7 +79,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Dropout] backward_time = %.2lfms\n", + "[Dropout] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_grad.get_float_ptr()); diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc index b5588e04fd..a2e9ee2ba8 100644 --- a/lib/local-execution/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -84,7 +84,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[ElementBinary] forward_time = %.2lfms\n", + "[ElementBinary] forward_time = {:.2lf}ms\n", per_device_state, input_lhs.get_float_ptr(), input_rhs.get_float_ptr(), @@ -111,7 +111,7 @@ static std::optional return profile(backward_kernel, profiling, - "[ElementBinary] backward_time = %.2lfms\n", + "[ElementBinary] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_lhs.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc 
b/lib/local-execution/src/ops/element_unary.cc index ddec57414a..2ad5d797f5 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -75,7 +75,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[ElementUnary] forward_time = %.2lfms\n", + "[ElementUnary] forward_time = {:.2lf}ms\n", per_device_state, attrs, handle, @@ -99,7 +99,7 @@ static std::optional return profile(backward_kernel, profiling, - "[ElementUnary] backward_time = %.2lfms\n", + "[ElementUnary] backward_time = {:.2lf}ms\n", per_device_state, attrs, handle, diff --git a/lib/local-execution/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc index bac48c4b24..6ce13d88c9 100644 --- a/lib/local-execution/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -53,7 +53,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Embedding] forward_time = %.2lfms\n", + "[Embedding] forward_time = {:.2lf}ms\n", input, output, weight, @@ -76,7 +76,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Embedding] forward_time = %.2lfms\n", + "[Embedding] forward_time = {:.2lf}ms\n", input, output, weight_grad, diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 9849bd3b73..194d84aaa8 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -33,7 +33,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Flat] forward_time = %.2lfms\n", + "[Flat] forward_time = {:.2lf}ms\n", input, output.get_float_ptr()); } @@ -48,7 +48,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Flat] forward_time = %.2lfms\n", + "[Flat] forward_time = {:.2lf}ms\n", input, input_grad.get_float_ptr(), output_grad.get_float_ptr()); diff 
--git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index 0f53348cbe..091abd0ed3 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -66,7 +66,7 @@ static DeviceSpecific PerDeviceFFHandle handle = acc.get_argument(HANDLE); auto const &attrs = acc.get_argument(ATTRS); - legion_dim_t legion_dim = to_legion(attrs.ff_dim, input.shape.num_dims()); + legion_dim_t legion_dim = to_legion(attrs.dim, input.shape.num_dims()); assert(input.shape.get_dim() == index.shape.get_dim()); assert(output.shape.get_dim() == index.shape.get_dim()); @@ -92,7 +92,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Gather] forward_time = %.2lfms\n", + "[Gather] forward_time = {:.2lf}ms\n", per_device_state, input, index, @@ -111,7 +111,7 @@ static std::optional return profile(forward_kernel, profiling, - "[Gather] forward_time = %.2lfms\n", + "[Gather] forward_time = {:.2lf}ms\n", per_device_state, output_grad, index, diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index fb97f946eb..620758772c 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -78,7 +78,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[LayerNorm] forward time = %.2lfms\n", + "[LayerNorm] forward time = {:.2lf}ms\n", state, input, output, @@ -101,7 +101,7 @@ static std::optional return profile(backward_kernel, profiling, - "[LayerNorm] backward time = %.2lfms\n", + "[LayerNorm] backward time = {:.2lf}ms\n", state, output_grad, input, diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 08e8fa3f68..36533be211 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -105,7 +105,7 @@ static std::optional 
forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Linear] forward_time = %.2lfms\n", + "[Linear] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -144,7 +144,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Linear] backward_time = %.2lfms\n", + "[Linear] backward_time = {:.2lf}ms\n", per_device_state, (void *)input.get_float_ptr(), (void *)input_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/partition.cc b/lib/local-execution/src/ops/partition.cc index 1d358b52f5..4b09ad026b 100644 --- a/lib/local-execution/src/ops/partition.cc +++ b/lib/local-execution/src/ops/partition.cc @@ -73,7 +73,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Reparition/Partition] forward_time = %.2lfms\n", + "[Reparition/Partition] forward_time = {:.2lf}ms\n", per_device_state, input, output); @@ -89,7 +89,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Reparition/Partition] backward_time = %.2lfms\n", + "[Reparition/Partition] backward_time = {:.2lf}ms\n", per_device_state, output_grad, input_grad); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 576a5a8d23..989f390380 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -113,7 +113,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Pool2D] forward_time = %.2lfms\n", + "[Pool2D] forward_time = {:.2lf}ms\n", state, input.get_float_ptr(), output.get_float_ptr()); @@ -132,7 +132,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Pool2D] backward_time = %.2lfms\n", + "[Pool2D] backward_time = {:.2lf}ms\n", state, input.get_float_ptr(), input_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/reduce.cc 
b/lib/local-execution/src/ops/reduce.cc index 0ccd7be6e3..98d1a6f522 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -83,7 +83,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Reduce] forward_time = %.2lfms\n", + "[Reduce] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr()); @@ -119,7 +119,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Reduce] backward_time = %.2lfms\n", + "[Reduce] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_grad.get_float_ptr()); diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index 86f300df63..3fa300f64d 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -55,7 +55,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling_settings, - "[Reduction] forward_time = %.2lfms\n", + "[Reduction] forward_time = {:.2lf}ms\n", input, output, num_replicas); @@ -69,7 +69,7 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, - "[Reduction] backward_time = %.2lfms\n", + "[Reduction] backward_time = {:.2lf}ms\n", input_grad, output_grad); } diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 3322f8a1ce..a441985b78 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -54,7 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[replicate] forward_time = %.2lfms\n", + "[replicate] forward_time = {:.2lf}ms\n", input, output); } @@ -69,7 +69,7 @@ static std::optional return profile(backward_kernel, profiling, - "[replicate] backward_time = 
%.2lfms\n", + "[replicate] backward_time = {:.2lf}ms\n", input_grad, output_grad, attrs.replicate_degree); diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index c53fe5d78b..efee73645b 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -69,7 +69,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Reshape] forward time = %.2lfms\n", + "[Reshape] forward time = {:.2lf}ms\n", per_device_state, input, output); @@ -86,7 +86,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Reshape] backward time = %.2lfms\n", + "[Reshape] backward time = {:.2lf}ms\n", per_device_state, input_grad, output_grad); diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 49f1e51076..7fefb3d357 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -63,7 +63,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[reverse] forward_time = %.2lfms\n", + "[reverse] forward_time = {:.2lf}ms\n", input.get_float_ptr(), output.get_float_ptr(), num_out_blks, @@ -93,7 +93,7 @@ static std::optional return profile(backward_kernel, profiling, - "[reverse] backward_time = %.2lfms\n", + "[reverse] backward_time = {:.2lf}ms\n", output_grad.get_float_ptr(), input_grad.get_float_ptr(), num_out_blks, diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 5a65127140..ea857c680b 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -72,7 +72,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[SoftMax] forward_time = %.2lfms\n", + "[SoftMax] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), 
output.get_float_ptr()); @@ -93,7 +93,7 @@ static std::optional return profile(backward_kernel, profiling, - "[SoftMax] backward_time = %.2lfms\n", + "[SoftMax] backward_time = {:.2lf}ms\n", input_grad.get_float_ptr(), output_grad.get_float_ptr(), output_grad.shape.get_volume()); diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc index ffb40515ad..13e95d37f9 100644 --- a/lib/local-execution/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -76,7 +76,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, profiling, - "Split forward_time = %.2lfms\n", + "Split forward_time = {:.2lf}ms\n", &output_float_ptr, input.get_float_ptr(), out_block_size, @@ -106,7 +106,7 @@ static std::optional float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, profiling, - "Split backward_time = %.2lfms\n", + "Split backward_time = {:.2lf}ms\n", input_grad.get_float_ptr(), &output_grad_ptr, out_block_size, diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc index f6783a2d6c..8aceb9c6d4 100644 --- a/lib/local-execution/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -81,7 +81,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[TopK] forward_time = %.2lfms\n", + "[TopK] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -109,7 +109,7 @@ static std::optional return profile(backward_kernel, profiling, - "[TopK] backward_time = %.2lfms\n", + "[TopK] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), indices.get_int32_ptr(), diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index f580a46792..c998484455 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ 
b/lib/local-execution/src/ops/transpose.cc @@ -84,7 +84,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Transpose] Forward_time = %.2lf [ms]", + "[Transpose] Forward_time = {:.2lf} [ms]", per_device_state, input, output); @@ -101,7 +101,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Transpose] Backward_time = %.2lf [ms]", + "[Transpose] Backward_time = {:.2lf} [ms]", per_device_state, input_grad, output_grad); diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h index 85a732e3d6..ca2406ef75 100644 --- a/lib/op-attrs/include/op-attrs/ops/gather.h +++ b/lib/op-attrs/include/op-attrs/ops/gather.h @@ -9,9 +9,9 @@ namespace FlexFlow { struct GatherAttrs { - req ff_dim; + ff_dim_t dim; }; -FF_VISITABLE_STRUCT(GatherAttrs, ff_dim); +FF_VISITABLE_STRUCT(GatherAttrs, dim); CHECK_VALID_OP_ATTR(GatherAttrs); } // namespace FlexFlow diff --git a/lib/substitutions/src/operator_attributes.cc b/lib/substitutions/src/operator_attributes.cc index 2c4cdfbcd8..8bd8688194 100644 --- a/lib/substitutions/src/operator_attributes.cc +++ b/lib/substitutions/src/operator_attributes.cc @@ -129,7 +129,7 @@ std::optional get_attribute(GatherAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::AXIS: - return p.ff_dim; + return p.dim; default: return std::nullopt; } From 26ddf7fe99b3b51fe9e32b015c5691380d3efa48 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 1 Jun 2024 16:19:49 -0700 Subject: [PATCH 22/24] Fix include --- .../include/{ => local-execution}/arg_ref.h | 4 ++-- .../include/{ => local-execution}/concrete_arg.h | 2 +- .../include/{ => local-execution}/config.h | 0 .../include/{ => local-execution}/cost_metrics.h | 0 .../{ => local-execution}/device_specific.h | 2 +- .../{ => local-execution}/legion_tensor_shape.h | 0 .../{ => local-execution}/local_allocator.h | 0 .../include/{ => 
local-execution}/op_arg_ref.h | 4 ++-- .../{ => local-execution}/op_task_invocation.h | 16 ++++++++-------- .../{ => local-execution}/op_task_signature.h | 8 ++++---- .../{ => local-execution}/op_tensor_spec.h | 2 +- .../include/{ => local-execution}/permissions.h | 0 .../include/{ => local-execution}/profiling.h | 0 .../{ => local-execution}/runtime_arg_ref.h | 6 +++--- .../{ => local-execution}/serialization.h | 0 .../{ => local-execution}/sim_environment.h | 6 +++--- .../include/{ => local-execution}/slot_id.h | 0 .../include/{ => local-execution}/slot_type.h | 0 .../task_argument_accessor.h | 14 +++++++------- .../include/{ => local-execution}/tasks.h | 0 .../{ => local-execution}/tracked_allocator.h | 2 +- .../{ => local-execution}/variadic_tensor_ref.h | 4 ++-- lib/local-execution/src/local_allocator.cc | 2 +- lib/local-execution/src/op_arg_ref.cc | 2 +- lib/local-execution/src/op_task_invocation.cc | 2 +- lib/local-execution/src/op_task_signature.cc | 2 +- lib/local-execution/src/ops/attention.cc | 2 +- lib/local-execution/src/ops/attention.h | 4 ++-- lib/local-execution/src/ops/batch_matmul.cc | 2 +- lib/local-execution/src/ops/batch_matmul.h | 6 +++--- lib/local-execution/src/ops/batch_norm.h | 4 ++-- lib/local-execution/src/ops/cast.cc | 2 +- lib/local-execution/src/ops/cast.h | 4 ++-- lib/local-execution/src/ops/combine.cc | 4 ++-- lib/local-execution/src/ops/combine.h | 4 ++-- lib/local-execution/src/ops/concat.cc | 4 ++-- lib/local-execution/src/ops/concat.h | 4 ++-- lib/local-execution/src/ops/conv_2d.h | 4 ++-- lib/local-execution/src/ops/dropout.cc | 4 ++-- lib/local-execution/src/ops/dropout.h | 6 +++--- lib/local-execution/src/ops/element_binary.h | 2 +- lib/local-execution/src/ops/element_unary.h | 4 ++-- lib/local-execution/src/ops/embedding.cc | 2 +- lib/local-execution/src/ops/embedding.h | 4 ++-- lib/local-execution/src/ops/flat.cc | 2 +- lib/local-execution/src/ops/flat.h | 2 +- lib/local-execution/src/ops/gather.cc | 4 ++-- 
lib/local-execution/src/ops/gather.h | 4 ++-- lib/local-execution/src/ops/layer_norm.h | 4 ++-- lib/local-execution/src/ops/linear.cc | 2 +- lib/local-execution/src/ops/linear.h | 4 ++-- lib/local-execution/src/ops/noop.cc | 2 +- lib/local-execution/src/ops/noop.h | 2 +- lib/local-execution/src/ops/pool_2d.h | 4 ++-- lib/local-execution/src/ops/reduce.h | 4 ++-- lib/local-execution/src/ops/reduction.h | 4 ++-- lib/local-execution/src/ops/repartition.h | 4 ++-- lib/local-execution/src/ops/replicate.h | 4 ++-- lib/local-execution/src/ops/reshape.h | 4 ++-- lib/local-execution/src/ops/reverse.h | 4 ++-- lib/local-execution/src/ops/softmax.h | 4 ++-- lib/local-execution/src/ops/split.h | 4 ++-- lib/local-execution/src/ops/topk.h | 4 ++-- lib/local-execution/src/ops/transpose.h | 4 ++-- lib/local-execution/src/permissions.cc | 2 +- lib/local-execution/src/runtime_arg_ref.cc | 4 ++-- lib/local-execution/src/tracked_allocator.cc | 2 +- lib/local-execution/src/variadic_tensor_ref.cc | 2 +- 68 files changed, 112 insertions(+), 112 deletions(-) rename lib/local-execution/include/{ => local-execution}/arg_ref.h (94%) rename lib/local-execution/include/{ => local-execution}/concrete_arg.h (96%) rename lib/local-execution/include/{ => local-execution}/config.h (100%) rename lib/local-execution/include/{ => local-execution}/cost_metrics.h (100%) rename lib/local-execution/include/{ => local-execution}/device_specific.h (96%) rename lib/local-execution/include/{ => local-execution}/legion_tensor_shape.h (100%) rename lib/local-execution/include/{ => local-execution}/local_allocator.h (100%) rename lib/local-execution/include/{ => local-execution}/op_arg_ref.h (86%) rename lib/local-execution/include/{ => local-execution}/op_task_invocation.h (88%) rename lib/local-execution/include/{ => local-execution}/op_task_signature.h (95%) rename lib/local-execution/include/{ => local-execution}/op_tensor_spec.h (89%) rename lib/local-execution/include/{ => local-execution}/permissions.h 
(100%) rename lib/local-execution/include/{ => local-execution}/profiling.h (100%) rename lib/local-execution/include/{ => local-execution}/runtime_arg_ref.h (83%) rename lib/local-execution/include/{ => local-execution}/serialization.h (100%) rename lib/local-execution/include/{ => local-execution}/sim_environment.h (96%) rename lib/local-execution/include/{ => local-execution}/slot_id.h (100%) rename lib/local-execution/include/{ => local-execution}/slot_type.h (100%) rename lib/local-execution/include/{ => local-execution}/task_argument_accessor.h (94%) rename lib/local-execution/include/{ => local-execution}/tasks.h (100%) rename lib/local-execution/include/{ => local-execution}/tracked_allocator.h (94%) rename lib/local-execution/include/{ => local-execution}/variadic_tensor_ref.h (81%) diff --git a/lib/local-execution/include/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h similarity index 94% rename from lib/local-execution/include/arg_ref.h rename to lib/local-execution/include/local-execution/arg_ref.h index b0e2b57b05..50fe4e6f80 100644 --- a/lib/local-execution/include/arg_ref.h +++ b/lib/local-execution/include/local-execution/arg_ref.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H #include "kernels/ff_handle.h" -#include "profiling.h" -#include "serialization.h" +#include "local-execution/profiling.h" +#include "local-execution/serialization.h" #include "utils/type_index.h" #include "utils/visitable.h" diff --git a/lib/local-execution/include/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h similarity index 96% rename from lib/local-execution/include/concrete_arg.h rename to lib/local-execution/include/local-execution/concrete_arg.h index 072500f47e..2db5e45e9e 100644 --- a/lib/local-execution/include/concrete_arg.h +++ b/lib/local-execution/include/local-execution/concrete_arg.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H #define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H 
-#include "serialization.h" +#include "local-execution/serialization.h" #include "utils/type_index.h" #include diff --git a/lib/local-execution/include/config.h b/lib/local-execution/include/local-execution/config.h similarity index 100% rename from lib/local-execution/include/config.h rename to lib/local-execution/include/local-execution/config.h diff --git a/lib/local-execution/include/cost_metrics.h b/lib/local-execution/include/local-execution/cost_metrics.h similarity index 100% rename from lib/local-execution/include/cost_metrics.h rename to lib/local-execution/include/local-execution/cost_metrics.h diff --git a/lib/local-execution/include/device_specific.h b/lib/local-execution/include/local-execution/device_specific.h similarity index 96% rename from lib/local-execution/include/device_specific.h rename to lib/local-execution/include/local-execution/device_specific.h index a055f6d274..6136d16f2d 100644 --- a/lib/local-execution/include/device_specific.h +++ b/lib/local-execution/include/local-execution/device_specific.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H #define _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H -#include "serialization.h" +#include "local-execution/serialization.h" #include "utils/exception.h" namespace FlexFlow { diff --git a/lib/local-execution/include/legion_tensor_shape.h b/lib/local-execution/include/local-execution/legion_tensor_shape.h similarity index 100% rename from lib/local-execution/include/legion_tensor_shape.h rename to lib/local-execution/include/local-execution/legion_tensor_shape.h diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local-execution/local_allocator.h similarity index 100% rename from lib/local-execution/include/local_allocator.h rename to lib/local-execution/include/local-execution/local_allocator.h diff --git a/lib/local-execution/include/op_arg_ref.h b/lib/local-execution/include/local-execution/op_arg_ref.h similarity index 86% rename from 
lib/local-execution/include/op_arg_ref.h rename to lib/local-execution/include/local-execution/op_arg_ref.h index 577ac7984a..1650656b42 100644 --- a/lib/local-execution/include/op_arg_ref.h +++ b/lib/local-execution/include/local-execution/op_arg_ref.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H -#include "arg_ref.h" -#include "device_specific.h" +#include "local-execution/arg_ref.h" +#include "local-execution/device_specific.h" #include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h similarity index 88% rename from lib/local-execution/include/op_task_invocation.h rename to lib/local-execution/include/local-execution/op_task_invocation.h index 1bf94a1b0d..9783d1fe88 100644 --- a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -1,17 +1,17 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H -#include "concrete_arg.h" +#include "local-execution/concrete_arg.h" #include "kernels/accessor.h" -#include "op_arg_ref.h" -#include "op_task_signature.h" -#include "op_tensor_spec.h" -#include "profiling.h" -#include "runtime_arg_ref.h" -#include "tasks.h" +#include "local-execution/op_arg_ref.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/op_tensor_spec.h" +#include "local-execution/profiling.h" +#include "local-execution/runtime_arg_ref.h" +#include "local-execution/tasks.h" #include "utils/bidict.h" #include "utils/stack_map.h" -#include "variadic_tensor_ref.h" +#include "local-execution/variadic_tensor_ref.h" #include #include #include diff --git a/lib/local-execution/include/op_task_signature.h b/lib/local-execution/include/local-execution/op_task_signature.h similarity index 95% rename from 
lib/local-execution/include/op_task_signature.h rename to lib/local-execution/include/local-execution/op_task_signature.h index 840c321627..3bcb8397b7 100644 --- a/lib/local-execution/include/op_task_signature.h +++ b/lib/local-execution/include/local-execution/op_task_signature.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H -#include "serialization.h" -#include "slot_id.h" -#include "slot_type.h" -#include "tasks.h" +#include "local-execution/serialization.h" +#include "local-execution/slot_id.h" +#include "local-execution/slot_type.h" +#include "local-execution/tasks.h" #include "utils/type_index.h" #include "utils/visitable.h" diff --git a/lib/local-execution/include/op_tensor_spec.h b/lib/local-execution/include/local-execution/op_tensor_spec.h similarity index 89% rename from lib/local-execution/include/op_tensor_spec.h rename to lib/local-execution/include/local-execution/op_tensor_spec.h index c12b5342e1..cc2cd75153 100644 --- a/lib/local-execution/include/op_tensor_spec.h +++ b/lib/local-execution/include/local-execution/op_tensor_spec.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H -#include "op_task_signature.h" +#include "local-execution/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/permissions.h b/lib/local-execution/include/local-execution/permissions.h similarity index 100% rename from lib/local-execution/include/permissions.h rename to lib/local-execution/include/local-execution/permissions.h diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/local-execution/profiling.h similarity index 100% rename from lib/local-execution/include/profiling.h rename to lib/local-execution/include/local-execution/profiling.h diff --git a/lib/local-execution/include/runtime_arg_ref.h 
b/lib/local-execution/include/local-execution/runtime_arg_ref.h similarity index 83% rename from lib/local-execution/include/runtime_arg_ref.h rename to lib/local-execution/include/local-execution/runtime_arg_ref.h index 05afa456cf..295f32455c 100644 --- a/lib/local-execution/include/runtime_arg_ref.h +++ b/lib/local-execution/include/local-execution/runtime_arg_ref.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H -#include "arg_ref.h" -#include "config.h" -#include "device_specific.h" +#include "local-execution/arg_ref.h" +#include "local-execution/config.h" +#include "local-execution/device_specific.h" namespace FlexFlow { diff --git a/lib/local-execution/include/serialization.h b/lib/local-execution/include/local-execution/serialization.h similarity index 100% rename from lib/local-execution/include/serialization.h rename to lib/local-execution/include/local-execution/serialization.h diff --git a/lib/local-execution/include/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h similarity index 96% rename from lib/local-execution/include/sim_environment.h rename to lib/local-execution/include/local-execution/sim_environment.h index 4409ab8b55..efcc41d58b 100644 --- a/lib/local-execution/include/sim_environment.h +++ b/lib/local-execution/include/local-execution/sim_environment.h @@ -1,13 +1,13 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H #define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H -#include "cost_metrics.h" +#include "local-execution/cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" #include "op-attrs/parallel_tensor_shape.h" -#include "op_task_invocation.h" +#include "local-execution/op_task_invocation.h" #include "pcg/machine_view.h" -#include "task_argument_accessor.h" +#include "local-execution/task_argument_accessor.h" #include namespace FlexFlow { diff --git a/lib/local-execution/include/slot_id.h 
b/lib/local-execution/include/local-execution/slot_id.h similarity index 100% rename from lib/local-execution/include/slot_id.h rename to lib/local-execution/include/local-execution/slot_id.h diff --git a/lib/local-execution/include/slot_type.h b/lib/local-execution/include/local-execution/slot_type.h similarity index 100% rename from lib/local-execution/include/slot_type.h rename to lib/local-execution/include/local-execution/slot_type.h diff --git a/lib/local-execution/include/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h similarity index 94% rename from lib/local-execution/include/task_argument_accessor.h rename to lib/local-execution/include/local-execution/task_argument_accessor.h index 0656af0fe3..df0637142a 100644 --- a/lib/local-execution/include/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,17 +1,17 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H -#include "arg_ref.h" -#include "concrete_arg.h" -#include "config.h" -#include "device_specific.h" +#include "local-execution/arg_ref.h" +#include "local-execution/concrete_arg.h" +#include "local-execution/config.h" +#include "local-execution/device_specific.h" #include "kernels/accessor.h" #include "kernels/allocation.h" #include "kernels/linear_kernels.h" #include "op-attrs/parallel_tensor_shape.h" -#include "op_task_signature.h" -#include "permissions.h" -#include "tasks.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/permissions.h" +#include "local-execution/tasks.h" #include "utils/variant.h" #include #include diff --git a/lib/local-execution/include/tasks.h b/lib/local-execution/include/local-execution/tasks.h similarity index 100% rename from lib/local-execution/include/tasks.h rename to lib/local-execution/include/local-execution/tasks.h diff --git 
a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h similarity index 94% rename from lib/local-execution/include/tracked_allocator.h rename to lib/local-execution/include/local-execution/tracked_allocator.h index 4f51670426..ea3eec64e0 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H #include "kernels/allocation.h" -#include "local_allocator.h" +#include "local-execution/local_allocator.h" namespace FlexFlow { diff --git a/lib/local-execution/include/variadic_tensor_ref.h b/lib/local-execution/include/local-execution/variadic_tensor_ref.h similarity index 81% rename from lib/local-execution/include/variadic_tensor_ref.h rename to lib/local-execution/include/local-execution/variadic_tensor_ref.h index 091c55b0af..56da1bab64 100644 --- a/lib/local-execution/include/variadic_tensor_ref.h +++ b/lib/local-execution/include/local-execution/variadic_tensor_ref.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H -#include "arg_ref.h" -#include "op_tensor_spec.h" +#include "local-execution/arg_ref.h" +#include "local-execution/op_tensor_spec.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local_allocator.cc b/lib/local-execution/src/local_allocator.cc index 0bb7d04574..d393643ead 100644 --- a/lib/local-execution/src/local_allocator.cc +++ b/lib/local-execution/src/local_allocator.cc @@ -1,4 +1,4 @@ -#include "local_allocator.h" +#include "local-execution/local_allocator.h" #include "kernels/device.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_arg_ref.cc b/lib/local-execution/src/op_arg_ref.cc index 6bea26a5a2..8e9b56272b 100644 --- a/lib/local-execution/src/op_arg_ref.cc +++ b/lib/local-execution/src/op_arg_ref.cc @@ -1,4 +1,4 @@ -#include 
"op_arg_ref.h" +#include "local-execution/op_arg_ref.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 5683cb12ec..adad2f3a72 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -1,4 +1,4 @@ -#include "op_task_invocation.h" +#include "local-execution/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index 71642680a6..53a685910e 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -1,4 +1,4 @@ -#include "op_task_signature.h" +#include "local-execution/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index 3f11829d2f..6e6d23cd4a 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -15,7 +15,7 @@ #include "attention.h" #include "kernels/attention_kernels.h" -#include "op_task_signature.h" +#include "local-execution/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/attention.h b/lib/local-execution/src/ops/attention.h index 601d8a4796..c8eb17ecec 100644 --- a/lib/local-execution/src/ops/attention.h +++ b/lib/local-execution/src/ops/attention.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_ATTENTION_H #define _FLEXFLOW_ATTENTION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/attention.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index 76bc88eae6..187e97ecaa 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -15,9 +15,9 @@ #include 
"batch_matmul.h" #include "kernels/batch_matmul_kernels.h" +#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" -#include "op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_matmul.h b/lib/local-execution/src/ops/batch_matmul.h index 6791b11a8c..94457c22be 100644 --- a/lib/local-execution/src/ops/batch_matmul.h +++ b/lib/local-execution/src/ops/batch_matmul.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_BATCH_MATMUL_H #define _FLEXFLOW_BATCH_MATMUL_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/batch_matmul.h" -#include "op_task_invocation.h" -#include "op_task_signature.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_norm.h b/lib/local-execution/src/ops/batch_norm.h index 6fae871c2c..1745a5cac8 100644 --- a/lib/local-execution/src/ops/batch_norm.h +++ b/lib/local-execution/src/ops/batch_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/batch_norm.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 7a74c8824d..9e1f777d73 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -16,7 +16,7 @@ #include "cast.h" #include "kernels/cast_kernels.h" -#include "op_task_signature.h" +#include "local-execution/op_task_signature.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Cast; diff --git a/lib/local-execution/src/ops/cast.h b/lib/local-execution/src/ops/cast.h index ce9a93aa32..69aeadf497 100644 --- a/lib/local-execution/src/ops/cast.h +++ 
b/lib/local-execution/src/ops/cast.h @@ -15,9 +15,9 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/cast.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index a39a503333..6df09b53f4 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -15,7 +15,7 @@ #include "combine.h" #include "kernels/combine_kernels.h" -#include "op_task_invocation.h" +#include "local-execution/op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -64,7 +64,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Combine] forward_time = {:.2lf}ms\n", + "[Combine] backward_time = {:.2lf}ms\n", input_grad, output_grad); } diff --git a/lib/local-execution/src/ops/combine.h b/lib/local-execution/src/ops/combine.h index 5923e9ebcc..f9349a01ef 100644 --- a/lib/local-execution/src/ops/combine.h +++ b/lib/local-execution/src/ops/combine.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/combine.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 3cbc232fac..f3c2eba48f 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -16,10 +16,10 @@ #include "concat.h" #include "kernels/concat_kernels.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/variadic_tensor_ref.h" #include "op-attrs/get_output_shapes.h" -#include "op_task_signature.h" #include "utils/hash-utils.h" -#include "variadic_tensor_ref.h" namespace FlexFlow { diff --git 
a/lib/local-execution/src/ops/concat.h b/lib/local-execution/src/ops/concat.h index d0a432e8b3..fa61d87e77 100644 --- a/lib/local-execution/src/ops/concat.h +++ b/lib/local-execution/src/ops/concat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/concat.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/conv_2d.h b/lib/local-execution/src/ops/conv_2d.h index 0e92b00553..0c8181adce 100644 --- a/lib/local-execution/src/ops/conv_2d.h +++ b/lib/local-execution/src/ops/conv_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/conv_2d.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index 3db1e7b8eb..9d680054ea 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -1,8 +1,8 @@ #include "dropout.h" #include "kernels/dropout_kernels.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" -#include "op_task_invocation.h" -#include "op_task_signature.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/dropout.h b/lib/local-execution/src/ops/dropout.h index 4f22842c8a..53fbeb3857 100644 --- a/lib/local-execution/src/ops/dropout.h +++ b/lib/local-execution/src/ops/dropout.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" +#include "local-execution/tasks.h" #include "op-attrs/ops/dropout.h" -#include 
"op_task_invocation.h" -#include "sim_environment.h" -#include "tasks.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/element_binary.h b/lib/local-execution/src/ops/element_binary.h index 342909c468..fa4202dffd 100644 --- a/lib/local-execution/src/ops/element_binary.h +++ b/lib/local-execution/src/ops/element_binary.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_ELEMENT_BINARY_H #define _FLEXFLOW_ELEMENT_BINARY_H +#include "local-execution/sim_environment.h" #include "op-attrs/ops/element_binary.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/element_unary.h b/lib/local-execution/src/ops/element_unary.h index 83f6177b8d..e0f58e8a75 100644 --- a/lib/local-execution/src/ops/element_unary.h +++ b/lib/local-execution/src/ops/element_unary.h @@ -1,9 +1,9 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/element_unary.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc index 6ce13d88c9..27d667cd00 100644 --- a/lib/local-execution/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -53,7 +53,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Embedding] forward_time = {:.2lf}ms\n", + "[Embedding] backward_time = {:.2lf}ms\n", input, output, weight, diff --git a/lib/local-execution/src/ops/embedding.h b/lib/local-execution/src/ops/embedding.h index b4caebf952..c33b1161bf 100644 --- a/lib/local-execution/src/ops/embedding.h +++ b/lib/local-execution/src/ops/embedding.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/embedding.h" -#include 
"op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 194d84aaa8..3c2499da79 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -48,7 +48,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Flat] forward_time = {:.2lf}ms\n", + "[Flat] backward_time = {:.2lf}ms\n", input, input_grad.get_float_ptr(), output_grad.get_float_ptr()); diff --git a/lib/local-execution/src/ops/flat.h b/lib/local-execution/src/ops/flat.h index 13246028fb..d9ea4d3985 100644 --- a/lib/local-execution/src/ops/flat.h +++ b/lib/local-execution/src/ops/flat.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H +#include "local-execution/sim_environment.h" #include "op-attrs/ops/flat.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index 091abd0ed3..deb436842a 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -15,7 +15,7 @@ #include "gather.h" #include "kernels/gather_kernels.h" -#include "legion_tensor_shape.h" +#include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" #include @@ -109,7 +109,7 @@ static std::optional auto index = acc.get_tensor(INDEX); auto input_grad = acc.get_tensor_grad(INPUT); - return profile(forward_kernel, + return profile(backward_kernel, profiling, "[Gather] forward_time = {:.2lf}ms\n", per_device_state, diff --git a/lib/local-execution/src/ops/gather.h b/lib/local-execution/src/ops/gather.h index e83f768cb7..e2de09d96a 100644 --- a/lib/local-execution/src/ops/gather.h +++ b/lib/local-execution/src/ops/gather.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_GATHER_H #define _FLEXFLOW_GATHER_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/gather.h" 
-#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/layer_norm.h b/lib/local-execution/src/ops/layer_norm.h index 83e6733bf6..4eadb9ff09 100644 --- a/lib/local-execution/src/ops/layer_norm.h +++ b/lib/local-execution/src/ops/layer_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/layer_norm.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 36533be211..e2c9d9aef4 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -1,8 +1,8 @@ #include "linear.h" #include "kernels/linear_kernels.h" +#include "local-execution/task_argument_accessor.h" #include "op-attrs/ff_dim.h" #include "op-attrs/get_output_shapes.h" -#include "task_argument_accessor.h" #include "utils/exception.h" #include "utils/graph/views.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/ops/linear.h b/lib/local-execution/src/ops/linear.h index 2b476382ef..2ff9016114 100644 --- a/lib/local-execution/src/ops/linear.h +++ b/lib/local-execution/src/ops/linear.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/linear.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/noop.cc b/lib/local-execution/src/ops/noop.cc index 02ffeaf111..168d547c17 100644 --- a/lib/local-execution/src/ops/noop.cc +++ b/lib/local-execution/src/ops/noop.cc @@ -14,7 +14,7 @@ */ #include "noop.h" -#include "op_task_invocation.h" +#include "local-execution/op_task_invocation.h" 
#include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/noop.h b/lib/local-execution/src/ops/noop.h index 17a9426e77..fab2cf1f86 100644 --- a/lib/local-execution/src/ops/noop.h +++ b/lib/local-execution/src/ops/noop.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_NOOP_H #define _FLEXFLOW_NOOP_H +#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/input.h" #include "op-attrs/ops/noop.h" -#include "op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/pool_2d.h b/lib/local-execution/src/ops/pool_2d.h index 852110e2e2..0537e9f1c4 100644 --- a/lib/local-execution/src/ops/pool_2d.h +++ b/lib/local-execution/src/ops/pool_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/pool_2d.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/reduce.h b/lib/local-execution/src/ops/reduce.h index 4c22a9127e..6d47ec2f4d 100644 --- a/lib/local-execution/src/ops/reduce.h +++ b/lib/local-execution/src/ops/reduce.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reduce.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/reduction.h b/lib/local-execution/src/ops/reduction.h index 071c4d2a7b..a69b75f310 100644 --- a/lib/local-execution/src/ops/reduction.h +++ b/lib/local-execution/src/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reduction.h" -#include "op_task_invocation.h" -#include 
"sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/repartition.h b/lib/local-execution/src/ops/repartition.h index 0c8cdaf0f9..a73bd3f808 100644 --- a/lib/local-execution/src/ops/repartition.h +++ b/lib/local-execution/src/ops/repartition.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/repartition.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/replicate.h b/lib/local-execution/src/ops/replicate.h index 510676931b..339f805f2c 100644 --- a/lib/local-execution/src/ops/replicate.h +++ b/lib/local-execution/src/ops/replicate.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/replicate.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/reshape.h b/lib/local-execution/src/ops/reshape.h index 0b845de5fc..14b22561a0 100644 --- a/lib/local-execution/src/ops/reshape.h +++ b/lib/local-execution/src/ops/reshape.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reshape.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/reverse.h b/lib/local-execution/src/ops/reverse.h index 68545644bd..5be501698c 100644 --- a/lib/local-execution/src/ops/reverse.h +++ b/lib/local-execution/src/ops/reverse.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reverse.h" 
-#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/softmax.h b/lib/local-execution/src/ops/softmax.h index 8fe2f96eb5..a83d8f4116 100644 --- a/lib/local-execution/src/ops/softmax.h +++ b/lib/local-execution/src/ops/softmax.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SOFTMAX_H #define _FLEXFLOW_SOFTMAX_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/softmax.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/split.h b/lib/local-execution/src/ops/split.h index 1fdfdc2432..f51e0ea6af 100644 --- a/lib/local-execution/src/ops/split.h +++ b/lib/local-execution/src/ops/split.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/split.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/topk.h b/lib/local-execution/src/ops/topk.h index fcab2a5a31..db85fd9d03 100644 --- a/lib/local-execution/src/ops/topk.h +++ b/lib/local-execution/src/ops/topk.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/topk.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/transpose.h b/lib/local-execution/src/ops/transpose.h index 6c6dffdc8a..daa64e8e59 100644 --- a/lib/local-execution/src/ops/transpose.h +++ b/lib/local-execution/src/ops/transpose.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/transpose.h" -#include 
"op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/permissions.cc b/lib/local-execution/src/permissions.cc index 2843dd1b70..e5c46b42f8 100644 --- a/lib/local-execution/src/permissions.cc +++ b/lib/local-execution/src/permissions.cc @@ -1,4 +1,4 @@ -#include "permissions.h" +#include "local-execution/permissions.h" #include "utils/exception.h" namespace FlexFlow { diff --git a/lib/local-execution/src/runtime_arg_ref.cc b/lib/local-execution/src/runtime_arg_ref.cc index a9d573bbb5..df4f024f1d 100644 --- a/lib/local-execution/src/runtime_arg_ref.cc +++ b/lib/local-execution/src/runtime_arg_ref.cc @@ -1,5 +1,5 @@ -#include "runtime_arg_ref.h" -#include "device_specific.h" +#include "local-execution/runtime_arg_ref.h" +#include "local-execution/device_specific.h" namespace FlexFlow { diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 6d06714252..68636906c3 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -1,4 +1,4 @@ -#include "tracked_allocator.h" +#include "local-execution/tracked_allocator.h" #include "kernels/device.h" namespace FlexFlow { diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/local-execution/src/variadic_tensor_ref.cc index 74d0f0d9e7..efd43a6648 100644 --- a/lib/local-execution/src/variadic_tensor_ref.cc +++ b/lib/local-execution/src/variadic_tensor_ref.cc @@ -1,4 +1,4 @@ -#include "variadic_tensor_ref.h" +#include "local-execution/variadic_tensor_ref.h" namespace FlexFlow { From 1dfc24e1653b4e139839af968b8266b674bcdb9e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 1 Jun 2024 16:22:42 -0700 Subject: [PATCH 23/24] Gather backward time --- lib/local-execution/src/ops/gather.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index deb436842a..50b27d72a6 
100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -111,7 +111,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Gather] forward_time = {:.2lf}ms\n", + "[Gather] backward_time = {:.2lf}ms\n", per_device_state, output_grad, index, From c60efd91b29378c1bae9c88787a25abe4b651993 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 1 Jun 2024 16:25:17 -0700 Subject: [PATCH 24/24] Format --- .../include/local-execution/op_task_invocation.h | 4 ++-- .../include/local-execution/sim_environment.h | 6 +++--- .../include/local-execution/task_argument_accessor.h | 8 ++++---- lib/local-execution/src/ops/embedding.cc | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h index 9783d1fe88..37ca5c239d 100644 --- a/lib/local-execution/include/local-execution/op_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -1,17 +1,17 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H -#include "local-execution/concrete_arg.h" #include "kernels/accessor.h" +#include "local-execution/concrete_arg.h" #include "local-execution/op_arg_ref.h" #include "local-execution/op_task_signature.h" #include "local-execution/op_tensor_spec.h" #include "local-execution/profiling.h" #include "local-execution/runtime_arg_ref.h" #include "local-execution/tasks.h" +#include "local-execution/variadic_tensor_ref.h" #include "utils/bidict.h" #include "utils/stack_map.h" -#include "local-execution/variadic_tensor_ref.h" #include #include #include diff --git a/lib/local-execution/include/local-execution/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h index efcc41d58b..78608a3228 100644 --- a/lib/local-execution/include/local-execution/sim_environment.h +++ 
b/lib/local-execution/include/local-execution/sim_environment.h @@ -1,13 +1,13 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H #define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H -#include "local-execution/cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" -#include "op-attrs/parallel_tensor_shape.h" +#include "local-execution/cost_metrics.h" #include "local-execution/op_task_invocation.h" -#include "pcg/machine_view.h" #include "local-execution/task_argument_accessor.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/machine_view.h" #include namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index df0637142a..663c862e18 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,17 +1,17 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/linear_kernels.h" #include "local-execution/arg_ref.h" #include "local-execution/concrete_arg.h" #include "local-execution/config.h" #include "local-execution/device_specific.h" -#include "kernels/accessor.h" -#include "kernels/allocation.h" -#include "kernels/linear_kernels.h" -#include "op-attrs/parallel_tensor_shape.h" #include "local-execution/op_task_signature.h" #include "local-execution/permissions.h" #include "local-execution/tasks.h" +#include "op-attrs/parallel_tensor_shape.h" #include "utils/variant.h" #include #include diff --git a/lib/local-execution/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc index 27d667cd00..00d6d033d4 100644 --- a/lib/local-execution/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -53,7 +53,7 @@ static std::optional 
forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Embedding] backward_time = {:.2lf}ms\n", + "[Embedding] forward_time = {:.2lf}ms\n", input, output, weight, @@ -76,7 +76,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Embedding] forward_time = {:.2lf}ms\n", + "[Embedding] backward_time = {:.2lf}ms\n", input, output, weight_grad,