diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 36796bc504..15f14f8757 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -42,7 +42,7 @@ struct ArrayShape { ArrayShape reversed_dim_order() const; ArrayShape sub_shape(std::optional start, - std::optional end); + std::optional end) const; public: LegionTensorDims dims; diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 17e0048c65..dedfbb01ef 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -9,9 +9,6 @@ namespace FlexFlow { -using ElementUnaryUnifiedAttrs = - std::variant; - struct ElementUnaryPerDeviceState { ffTensorDescriptor_t inputTensor, outputTensor; req actiDesc; diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index c74f9c0bb6..13bf4b898a 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -2,36 +2,34 @@ #define _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H #include "accessor.h" -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { struct GatherPerDeviceState { - int legion_dim; - req index_data_type; + PerDeviceFFHandle handle; + legion_dim_t legion_dim; }; + FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, - legion_dim, - index_data_type); + handle, + legion_dim); namespace Kernels { namespace Gather { + void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size); + GenericTensorAccessorW const &output); + void backward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &output_grad, 
GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size); + GenericTensorAccessorW const &input_grad); + } // namespace Gather } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 37d0112eab..286acf7376 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -25,10 +25,10 @@ template __global__ void gather_forward(float const *input, IndexType const *index, float *output, - size_t output_size, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] @@ -39,10 +39,10 @@ __global__ void gather_forward(float const *input, // [outer_index, index[0], left_over] // Therefore, input_index = outer_index * (stride * input_dim_size) // + index[0] * stride + left_over; - size_t outer_index = o / (stride * output_dim_size); + coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size - size_t left_over = o % stride; - size_t input_idx = + coord_t left_over = o % stride; + coord_t input_idx = outer_index * (stride * input_dim_size) + index[o] * stride + left_over; output[o] = input[input_idx]; } @@ -52,10 +52,10 @@ template __global__ void gather_backward(float const *output_grad, IndexType const *index, float *input_grad, - size_t output_size, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * 
stride, stride, 1] @@ -66,10 +66,10 @@ __global__ void gather_backward(float const *output_grad, // [outer_index, index[0], left_over] // Therefore, input_index = outer_index * (stride * input_dim_size) // + index[0] * stride + left_over; - size_t outer_index = o / (stride * output_dim_size); + coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size - size_t left_over = o % stride; - size_t input_idx = + coord_t left_over = o % stride; + coord_t input_idx = outer_index * (stride * input_dim_size) + index[o] * stride + left_over; atomicAdd(&input_grad[input_idx], output_grad[o]); @@ -78,100 +78,97 @@ __global__ void gather_backward(float const *output_grad, template struct ForwardKernel { - void operator()(cudaStream_t stream, - GatherPerDeviceState const &m, + void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - /*size_t stride = 1; - for (int i = 0; i < m->legion_dim; i++) { - stride *= (output.domain.hi()[i] - output.domain.lo()[i] + 1); - } - size_t dim_size = - output.domain.hi()[m->legion_dim] - output.domain.lo()[m->legion_dim] + - 1; -*/ - gather_forward> - <<>>(input.get(), - index.get(), - output.get(), - output.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { + gather_forward<<>>( + input.get_float_ptr(), + index.get(), + output.get_float_ptr(), + output_size, + stride, + input_dim_size, + output_dim_size); } }; -void forward_kernel(cudaStream_t stream, +template +struct BackwardKernel { + void operator()(ffStream_t stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &input_grad, + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t 
output_dim_size) { + gather_backward<<>>( + output_grad.get_float_ptr(), + index.get(), + input_grad.get_float_ptr(), + output_size, + stride, + input_dim_size, + output_dim_size); + } +}; + +void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - DataTypeDispatch1{}(m.index_data_type, + GenericTensorAccessorW const &output) { + checkCUDA(get_legion_stream(&stream)); + + coord_t stride = + output.shape + .sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1}) + .get_volume(); + coord_t output_dim_size = output.shape[m.legion_dim]; + coord_t input_dim_size = input.shape[m.legion_dim]; + + assert(index.data_type == DataType::INT32 || + index.data_type == DataType::INT64); + + DataTypeDispatch1{}(index.data_type, stream, - m, input, index, output, + output.shape.get_volume(), stride, input_dim_size, output_dim_size); } -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - GatherPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - /*size_t stride = 1; - for (int i = 0; i < m->legion_dim; i++) { - stride *= (output_grad.domain.hi()[i] - output_grad.domain.lo()[i] + 1); - } - size_t dim_size = output_grad.domain.hi()[m->legion_dim] - - output_grad.domain.lo()[m->legion_dim] + 1; - */ - gather_backward> - <<>>(output_grad.get(), - index.get(), - input_grad.get(), - output_grad.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); - } -}; - -void backward_kernel(cudaStream_t stream, +void backward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, - GenericTensorAccessorW const 
&input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - DataTypeDispatch1{}(m.index_data_type, + GenericTensorAccessorW const &input_grad) { + checkCUDA(get_legion_stream(&stream)); + + coord_t stride = + output_grad.shape + .sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1}) + .get_volume(); + coord_t output_dim_size = output_grad.shape[m.legion_dim]; + coord_t input_dim_size = input_grad.shape[m.legion_dim]; + + assert(index.data_type == DataType::INT32 || + index.data_type == DataType::INT64); + + DataTypeDispatch1{}(index.data_type, stream, - m, output_grad, index, input_grad, + output_grad.shape.get_volume(), stride, input_dim_size, output_dim_size); diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt index ee1d8fecdc..6b432fad75 100644 --- a/lib/local-execution/CMakeLists.txt +++ b/lib/local-execution/CMakeLists.txt @@ -12,4 +12,5 @@ ff_add_library( utils kernels pcg + spdlog ) \ No newline at end of file diff --git a/lib/runtime/src/task_spec/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h similarity index 50% rename from lib/runtime/src/task_spec/arg_ref.h rename to lib/local-execution/include/local-execution/arg_ref.h index 62f89f0b5c..50fe4e6f80 100644 --- a/lib/runtime/src/task_spec/arg_ref.h +++ b/lib/local-execution/include/local-execution/arg_ref.h @@ -1,9 +1,9 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H #include "kernels/ff_handle.h" -#include "runtime/profiling.h" -#include "runtime/task_spec/arg_type_runtime_tag.h" +#include "local-execution/profiling.h" +#include "local-execution/serialization.h" #include "utils/type_index.h" #include "utils/visitable.h" @@ -21,37 +21,43 @@ struct ArgRefSpec { template bool holds() const { - return this->type_tag.template matches(); + // return this->type_tag.template matches(); + + return 
matches(this->type_idx); } LABEL_TYPE const &get_ref_type() const { return this->ref_type; } - ArgTypeRuntimeTag get_type_tag() const { - return this->type_tag; + // TODO - how to extend this for legion runtime? + // ArgTypeRuntimeTag get_type_tag() const { + // return this->type_tag; + // } + std::type_index get_type_index() const { + return this->type_idx; } template static ArgRefSpec create(ArgRef const &r) { static_assert(is_serializable::value, "Type must be serializeable"); - return ArgRefSpec(ArgTypeRuntimeTag::create(), r.ref_type); + return ArgRefSpec(get_type_index_for_type(), r.ref_type); } template static ArgRefSpec create_device_specific(ArgRef const &r, size_t device_idx) { - return ArgRefSpec(ArgTypeRuntimeTag::create(), r.ref_type, device_idx); + return ArgRefSpec(get_type_index_for_type(), r.ref_type, device_idx); } private: - ArgRefSpec(ArgTypeRuntimeTag const &type_tag, LABEL_TYPE ref_type) - : type_tag(type_tag), ref_type(ref_type) {} + ArgRefSpec(std::type_index const &type_index, LABEL_TYPE ref_type) + : type_idx(type_index), ref_type(ref_type) {} - ArgTypeRuntimeTag type_tag; + std::type_index type_idx; LABEL_TYPE ref_type; - optional device_idx = nullopt; + std::optional device_idx = std::nullopt; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h new file mode 100644 index 0000000000..2db5e45e9e --- /dev/null +++ b/lib/local-execution/include/local-execution/concrete_arg.h @@ -0,0 +1,55 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H +#define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H + +#include "local-execution/serialization.h" +#include "utils/type_index.h" +#include + +namespace FlexFlow { + +struct ConcreteArgSpec { +public: + ConcreteArgSpec() = delete; + + template + T const &get() const { + assert(matches(this->type_idx)); + + return *(T const *)ptr.get(); + } + + // ArgTypeRuntimeTag get_type_tag() const { + // return 
this->type_tag; + // } + // size_t serialize(Legion::Serializer &) const; + + std::type_index get_type_index() const { + return this->type_idx; + } + + template + static ConcreteArgSpec create(T const &t) { + static_assert(is_serializable::value, "Type must be serializable"); + + std::type_index type_idx = get_type_index_for_type(); + std::shared_ptr ptr = + std::static_pointer_cast(std::make_shared(t)); + + return ConcreteArgSpec(type_idx, ptr); + // ArgTypeRuntimeTag::create()); + } + +private: + ConcreteArgSpec(std::type_index const &type_index, + std::shared_ptr ptr) + : type_idx(type_index), ptr(ptr) {} + // ArgTypeRuntimeTag const &); + + // ArgTypeRuntimeTag type_tag; + std::type_index type_idx; + std::shared_ptr ptr; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/include/runtime/config.h b/lib/local-execution/include/local-execution/config.h similarity index 89% rename from lib/runtime/include/runtime/config.h rename to lib/local-execution/include/local-execution/config.h index 34f45040d1..73653aebae 100644 --- a/lib/runtime/include/runtime/config.h +++ b/lib/local-execution/include/local-execution/config.h @@ -13,12 +13,11 @@ * limitations under the License. 
*/ -#ifndef _FLEXFLOW_CONFIG_H_ -#define _FLEXFLOW_CONFIG_H_ -#include "legion.h" +#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONFIG_H_ +#define _FLEXFLOW_LOCAL_EXECUTION_CONFIG_H_ + #include "op-attrs/param_sync.h" #include "utils/fmt.h" -#include "utils/optional.h" #include "utils/visitable.h" #include @@ -47,6 +46,8 @@ struct FFInitInfo : public use_visitable_cmp { bool allowTensorOpMathConversion; }; +using legion_mapping_tag_id_t = unsigned long; + struct FFConfig : public use_visitable_cmp { public: enum PreservedIDs { @@ -64,7 +65,7 @@ struct FFConfig : public use_visitable_cmp { }; FFConfig() = default; - static Legion::MappingTagID get_hash_id(std::string const &pcname); + static legion_mapping_tag_id_t get_hash_id(std::string const &pcname); public: int epochs = 1; @@ -88,16 +89,17 @@ struct FFConfig : public use_visitable_cmp { bool enable_inplace_optimizations = false; // Control Tensor Op Math Conversion bool allow_tensor_op_math_conversion = false; - optional dataset_path = nullopt; - optional export_strategy_computation_graph_file = nullopt; + std::optional dataset_path = std::nullopt; + std::optional export_strategy_computation_graph_file = + std::nullopt; bool include_costs_dot_graph = false; - optional substitution_json_path = nullopt; + std::optional substitution_json_path = std::nullopt; int machine_model_version = 0; - optional machine_model_file = nullopt; + std::optional machine_model_file = std::nullopt; int simulator_segment_size = 16777216; // 16 MB int simulator_max_num_segments = 1; - optional search_num_nodes = nullopt; - optional search_num_workers = nullopt; + std::optional search_num_nodes = std::nullopt; + std::optional search_num_workers = std::nullopt; int base_optimize_threshold = 10; bool enable_control_replication = true; // The default python data loader type is 2 to enable control replication diff --git a/lib/runtime/src/cost_metrics.h b/lib/local-execution/include/local-execution/cost_metrics.h similarity index 95% rename from 
lib/runtime/src/cost_metrics.h rename to lib/local-execution/include/local-execution/cost_metrics.h index 77526ccd1a..edc0190daf 100644 --- a/lib/runtime/src/cost_metrics.h +++ b/lib/local-execution/include/local-execution/cost_metrics.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_COST_METRICS_H -#define _FLEXFLOW_RUNTIME_SRC_COST_METRICS_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H +#define _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H #include "utils/visitable.h" diff --git a/lib/runtime/src/task_spec/device_specific.h b/lib/local-execution/include/local-execution/device_specific.h similarity index 64% rename from lib/runtime/src/task_spec/device_specific.h rename to lib/local-execution/include/local-execution/device_specific.h index e29e4e9450..6136d16f2d 100644 --- a/lib/runtime/src/task_spec/device_specific.h +++ b/lib/local-execution/include/local-execution/device_specific.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_DEVICE_SPECIFIC_ARG_H -#define _FLEXFLOW_RUNTIME_SRC_DEVICE_SPECIFIC_ARG_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H +#define _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H -#include "serialization.h" +#include "local-execution/serialization.h" #include "utils/exception.h" namespace FlexFlow { @@ -10,10 +10,17 @@ template struct DeviceSpecific { DeviceSpecific() = delete; + DeviceSpecific(T ptr_type) { // accessor + size_t device_idx = 0; + DeviceSpecific device_specific = + DeviceSpecific::create(device_idx, ptr_type); + this->ptr = device_specific.ptr; + this->device_idx = device_specific.device_idx; + } template static DeviceSpecific create(size_t device_idx, Args &&...args) { - NOT_IMPLEMENTED(); + NOT_IMPLEMENTED(); // accessor } T const *get(size_t curr_device_idx) const { @@ -26,6 +33,8 @@ struct DeviceSpecific { return this->ptr; } + // TODO: can modify ptr + private: T *ptr; size_t device_idx; diff --git a/lib/runtime/src/legion_tensor_shape.h b/lib/local-execution/include/local-execution/legion_tensor_shape.h similarity 
index 92% rename from lib/runtime/src/legion_tensor_shape.h rename to lib/local-execution/include/local-execution/legion_tensor_shape.h index 1f5fab76a6..ff96ba9a15 100644 --- a/lib/runtime/src/legion_tensor_shape.h +++ b/lib/local-execution/include/local-execution/legion_tensor_shape.h @@ -28,8 +28,8 @@ struct LegionTensorShape : public use_visitable_cmp, DataType data_type; }; -ff_dim_t to_ff(legion_dim_t, int num_dims); -legion_dim_t to_legion(ff_dim_t, int num_dims); +ff_dim_t to_ff(legion_dim_t, size_t num_dims); +legion_dim_t to_legion(ff_dim_t, size_t num_dims); ff_dim_t to_ff(legion_dim_t, TensorShape const &); legion_dim_t to_legion(ff_dim_t, TensorShape const &); diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local-execution/local_allocator.h similarity index 82% rename from lib/local-execution/include/local_allocator.h rename to lib/local-execution/include/local-execution/local_allocator.h index f4b253b281..b47220eb8c 100644 --- a/lib/local-execution/include/local_allocator.h +++ b/lib/local-execution/include/local-execution/local_allocator.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H -#define _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H #include "kernels/allocation.h" #include diff --git a/lib/runtime/src/task_spec/op_arg_ref.h b/lib/local-execution/include/local-execution/op_arg_ref.h similarity index 54% rename from lib/runtime/src/task_spec/op_arg_ref.h rename to lib/local-execution/include/local-execution/op_arg_ref.h index 3e931d79a4..1650656b42 100644 --- a/lib/runtime/src/task_spec/op_arg_ref.h +++ b/lib/local-execution/include/local-execution/op_arg_ref.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H -#include 
"arg_ref.h" -#include "device_specific.h" +#include "local-execution/arg_ref.h" +#include "local-execution/device_specific.h" #include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { @@ -15,13 +15,9 @@ using OpArgRef = ArgRef; using OpArgRefSpec = ArgRefSpec; template -OpArgRef> per_device_op_state() { - return {OpArgRefType::PER_DEVICE_OP_STATE}; -} +OpArgRef> per_device_op_state(); -OpArgRef input_parallel_tensor_shape(int idx) { - return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; -} +OpArgRef input_parallel_tensor_shape(int idx); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h new file mode 100644 index 0000000000..37ca5c239d --- /dev/null +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -0,0 +1,97 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H + +#include "kernels/accessor.h" +#include "local-execution/concrete_arg.h" +#include "local-execution/op_arg_ref.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/op_tensor_spec.h" +#include "local-execution/profiling.h" +#include "local-execution/runtime_arg_ref.h" +#include "local-execution/tasks.h" +#include "local-execution/variadic_tensor_ref.h" +#include "utils/bidict.h" +#include "utils/stack_map.h" +#include +#include +#include +#include + +namespace FlexFlow { + +enum class IsTrainable { YES, NO }; + +using OpArgSpec = + std::variant; + +struct OpTaskBinding { + OpTaskBinding() = default; + + void bind(slot_id, VariadicTensorRef const &) { + NOT_IMPLEMENTED(); + } + void bind(slot_id, OpTensorSpec const &); + void bind_grad(slot_id, OpTensorSpec const &); + + template + void bind_device_specific_arg(slot_id name, T const &t) { + NOT_IMPLEMENTED(); + } + + template + void bind_device_specific_arg(slot_id name, OpArgRef const &t) { + NOT_IMPLEMENTED(); + } + + template + 
void bind_arg(slot_id name, T const &t) { + this->insert_arg_spec(name, ConcreteArgSpec::create(t)); + } + + template + void bind_arg(slot_id name, RuntimeArgRef const &ref) { + this->insert_arg_spec(name, RuntimeArgRefSpec::create(ref)); + } + + template + void bind_arg(slot_id name, OpArgRef const &ref) { + this->insert_arg_spec(name, OpArgRefSpec::create(ref)); + } + + std::unordered_map, OpTensorSpec> const & + get_tensor_bindings() const; + std::unordered_map const &get_arg_bindings() const; + + void insert_arg_spec(slot_id name, OpArgSpec const &arg_spec) { + assert(!contains_key(this->arg_bindings, name)); + this->arg_bindings.insert({name, arg_spec}); + } + + std::unordered_map arg_bindings; + std::unordered_map, OpTensorSpec> tensor_bindings; +}; +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskBinding, + arg_bindings, + tensor_bindings); + +struct OpTaskInvocation { +public: + OpTaskInvocation() = delete; + OpTaskInvocation(task_id_t const &task_id, OpTaskBinding const &binding) + : task_id(task_id), binding(binding) {} + +public: + task_id_t task_id; + OpTaskBinding binding; +}; +FF_VISITABLE_STRUCT(OpTaskInvocation, task_id, binding); + +OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); +OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); + +bool is_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/task_spec/op_task_signature.h b/lib/local-execution/include/local-execution/op_task_signature.h similarity index 73% rename from lib/runtime/src/task_spec/op_task_signature.h rename to lib/local-execution/include/local-execution/op_task_signature.h index 656df39309..3bcb8397b7 100644 --- a/lib/runtime/src/task_spec/op_task_signature.h +++ b/lib/local-execution/include/local-execution/op_task_signature.h @@ -1,8 +1,11 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OP_TASK_SIGNATURE_H -#define _FLEXFLOW_RUNTIME_SRC_OP_TASK_SIGNATURE_H - -#include 
"task_invocation.h" -#include "task_signature.h" +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H + +#include "local-execution/serialization.h" +#include "local-execution/slot_id.h" +#include "local-execution/slot_type.h" +#include "local-execution/tasks.h" +#include "utils/type_index.h" #include "utils/visitable.h" namespace FlexFlow { @@ -14,6 +17,7 @@ enum class TensorRole { }; enum class OpTaskType { INIT, FWD, BWD }; +enum class IsGrad { YES, NO }; enum class OpSlotOptions { OPTIONAL, @@ -25,7 +29,6 @@ enum class OpSlotOptions { struct OpTensorSlotSpec { public: OpTensorSlotSpec() = delete; - OpTensorSlotSpec(slot_id, SlotType, TensorRole); public: slot_id name; @@ -41,7 +44,9 @@ struct OpTaskSignature { OpTaskSignature() = delete; explicit OpTaskSignature(OpTaskType); - OpTaskType get_task_type() const; + OpTaskType get_task_type() const { + return this->type; + } void add_input_slot(slot_id, SlotType slot_type = SlotType::TENSOR); void add_optional_input_slot(slot_id, SlotType slot_type = SlotType::TENSOR); @@ -59,45 +64,35 @@ struct OpTaskSignature { void add_from_slot_spec(OpTensorSlotSpec const &spec); - /* void add_input_slot(slot_id, Legion::PrivilegeMode); */ - /* void add_input_slot(slot_id, SlotType, Legion::PrivilegeMode); */ - - bool operator==(OpTaskSignature const &) const; - bool operator!=(OpTaskSignature const &) const; - template void add_arg_slot(slot_id name) { static_assert(is_serializable::value, "Type must be serializable"); + this->task_arg_types.insert({name, get_type_index_for_type()}); } template - void add_return_value(); + void add_return_value() { + this->return_value = get_type_index_for_type(); + } // adds arg_slot without checking is_serializable, used for arguments that are // deviceSpecific template void add_unchecked_arg_slot(slot_id name) { - NOT_IMPLEMENTED(); + this->task_arg_types.insert({name, get_type_index_for_type()}); } - std::unordered_set 
get_tensor_slots(); + std::unordered_set get_tensor_slots() const; void set_arg_types(std::unordered_map const &); - std::unordered_map get_arg_types(); + std::unordered_map get_arg_types() const; -private: + OpTaskType type; + std::optional return_value; std::unordered_map task_arg_types; std::unordered_set op_tensor_slots; }; - -template -OpTaskSignature init_signature(); -template -OpTaskSignature fwd_signature(); -template -OpTaskSignature bwd_signature(); - -template -OpTaskSignature get_signature(); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + OpTaskSignature, type, return_value, task_arg_types, op_tensor_slots); template void register_task(task_id_t, @@ -112,6 +107,15 @@ void register_task(task_id_t, F const &func, F const &cpu_func); +template +OpTaskSignature init_signature(); + +template +OpTaskSignature fwd_signature(); + +template +OpTaskSignature bwd_signature(); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/op_tensor_spec.h b/lib/local-execution/include/local-execution/op_tensor_spec.h new file mode 100644 index 0000000000..cc2cd75153 --- /dev/null +++ b/lib/local-execution/include/local-execution/op_tensor_spec.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H + +#include "local-execution/op_task_signature.h" + +namespace FlexFlow { + +struct OpTensorSpec { + TensorRole role; + OpSlotOptions slot_option; + req idx; +}; +FF_VISITABLE_STRUCT(OpTensorSpec, role, slot_option, idx); + +OpTensorSpec input_tensor(int); +OpTensorSpec output_tensor(int); +OpTensorSpec weight_tensor(int); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/permissions.h b/lib/local-execution/include/local-execution/permissions.h similarity index 84% rename from lib/runtime/src/permissions.h rename to lib/local-execution/include/local-execution/permissions.h index e7793a1dcb..ce19e38e7e 100644 --- a/lib/runtime/src/permissions.h +++ 
b/lib/local-execution/include/local-execution/permissions.h @@ -1,18 +1,13 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_PERMISSION_H -#define _FLEXFLOW_RUNTIME_SRC_PERMISSION_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H +#define _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H -#include "legion.h" #include "utils/exception.h" #include "utils/fmt.h" -#include "utils/optional.h" namespace FlexFlow { enum class Permissions { NONE, RO, WO, RW }; -Legion::PrivilegeMode to_legion(Permissions); -optional from_legion(Legion::PrivilegeMode); - Permissions join(Permissions lhs, Permissions rhs); Permissions meet(Permissions lhs, Permissions rhs); diff --git a/lib/runtime/include/runtime/profiling.h b/lib/local-execution/include/local-execution/profiling.h similarity index 64% rename from lib/runtime/include/runtime/profiling.h rename to lib/local-execution/include/local-execution/profiling.h index 3f43ede520..bd50801fc4 100644 --- a/lib/runtime/include/runtime/profiling.h +++ b/lib/local-execution/include/local-execution/profiling.h @@ -1,21 +1,20 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_PROFILING_H -#define _FLEXFLOW_RUNTIME_SRC_PROFILING_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H +#define _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H #include "kernels/profiling.h" -#include "legion.h" -#include "loggers.h" +#include "spdlog/spdlog.h" namespace FlexFlow { enum class EnableProfiling { YES, NO }; template -optional +std::optional profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { - optional elapsed = + std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); if (elapsed.has_value()) { - log_profile.debug(s, elapsed.value()); + spdlog::debug(s, elapsed.value()); } return elapsed; } diff --git a/lib/runtime/src/task_spec/runtime_arg_ref.h b/lib/local-execution/include/local-execution/runtime_arg_ref.h similarity index 73% rename from lib/runtime/src/task_spec/runtime_arg_ref.h rename to lib/local-execution/include/local-execution/runtime_arg_ref.h index 
655300e692..295f32455c 100644 --- a/lib/runtime/src/task_spec/runtime_arg_ref.h +++ b/lib/local-execution/include/local-execution/runtime_arg_ref.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H -#include "arg_ref.h" -#include "device_specific.h" -#include "runtime/config.h" +#include "local-execution/arg_ref.h" +#include "local-execution/config.h" +#include "local-execution/device_specific.h" namespace FlexFlow { @@ -20,7 +20,7 @@ using RuntimeArgRefSpec = ArgRefSpec; RuntimeArgRef profiling_settings(); RuntimeArgRef> ff_handle(); -RuntimeArgRef iteration_config(); +RuntimeArgRef> iteration_config(); } // namespace FlexFlow diff --git a/lib/runtime/src/serialization.h b/lib/local-execution/include/local-execution/serialization.h similarity index 55% rename from lib/runtime/src/serialization.h rename to lib/local-execution/include/local-execution/serialization.h index 65601990b0..147ed8159c 100644 --- a/lib/runtime/src/serialization.h +++ b/lib/local-execution/include/local-execution/serialization.h @@ -1,12 +1,9 @@ -#ifndef _FLEXFLOW_RUNTIME_SERIALIZATION_H -#define _FLEXFLOW_RUNTIME_SERIALIZATION_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SERIALIZATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_SERIALIZATION_H #include "kernels/device.h" #include "kernels/nccl.h" -#include "legion.h" -#include "legion/legion_utilities.h" #include "op-attrs/dim_ordered.h" -#include "utils/optional.h" #include "utils/required.h" #include "utils/type_traits.h" #include "utils/variant.h" @@ -28,23 +25,6 @@ namespace FlexFlow { template struct needs_serialization {}; -/* template */ -/* class Serializer { */ -/* void serialize(Legion::Serializer &, T const &) const; */ -/* void deserialize(Legion::Deserializer &, T &) const; */ -/* }; */ - -/* template struct trivially_serializable; */ - -/* template struct - * visit_trivially_serializable; */ - -/* template >::value && - * visit_serializable::value)>::type> */ - 
template struct visit_trivially_serializable; @@ -101,6 +81,10 @@ struct is_trivially_serializable< typename std::enable_if::value>::type> : std::true_type {}; +template +struct is_trivially_serializable> + : is_trivially_serializable {}; + template struct is_trivially_serializable> : is_trivially_serializable {}; @@ -155,108 +139,6 @@ static_assert(std::is_same, static_assert(visit_trivially_serializable::value, ""); static_assert(is_trivially_serializable::value, ""); -template -struct Serialization { - void serialize(Legion::Serializer &, T const &) const; - T deserialize(Legion::Deserializer &) const; -}; - -template -struct Serialization< - T, - typename std::enable_if::value>::type> { - static void serialize(Legion::Serializer &sez, T const &t) { - sez.serialize(&t, sizeof(T)); - } - - static T const &deserialize(Legion::Deserializer &dez) { - void const *cur = dez.get_current_pointer(); - dez.advance_pointer(sizeof(T)); - return *(T const *)cur; - } -}; - -struct needs_serialize_visitor { - bool result = true; - - template - void operator()(char const *, T const &t) { - result &= needs_serialize(t); - } -}; - -template -bool visit_needs_serialize(T const &t) { - needs_serialize_visitor vis; - visit_struct::for_each(t, vis); - return vis.result; -} - -struct serialize_visitor { - serialize_visitor() = delete; - explicit serialize_visitor(Legion::Serializer &sez) : sez(sez) {} - - Legion::Serializer &sez; - - template - void operator()(char const *, T const &t) { - serialize(this->sez, t); - } -}; - -template -void visit_serialize(Legion::Serializer &sez, T const &t) { - serialize_visitor vis(sez); - visit_struct::for_each(t, vis); -} - -struct deserialize_visitor { - deserialize_visitor() = delete; - explicit deserialize_visitor(Legion::Deserializer &dez) : dez(dez) {} - - Legion::Deserializer &dez; - - template - T const &operator()(char const *, T &t) { - deserialize(dez, t); - } -}; - -template -T const &visit_deserialize(Legion::Deserializer &dez) { - 
deserialize_visitor vis(dez); - return visit_struct::for_each(vis); -} - -template -class VisitSerialize { - void serialize(Legion::Serializer &sez, T const &t) const { - return visit_serialize(sez, t); - } - - T const &deserialize(Legion::Deserializer &dez) const { - return visit_deserialize(dez); - } -}; - -template -size_t ff_task_serialize(Legion::Serializer &sez, T const &t) { - static_assert(is_serializable::value, "Type must be serializable"); - - size_t pre_size = sez.get_used_bytes(); - Serialization::serialize(sez, t); - size_t post_size = sez.get_used_bytes(); - - return post_size - pre_size; -} - -template -T const &ff_task_deserialize(Legion::Deserializer &dez) { - static_assert(is_serializable::value, "Type must be serializable"); - - return Serialization::deserialize(dez); -} - } // namespace FlexFlow #endif diff --git a/lib/runtime/src/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h similarity index 94% rename from lib/runtime/src/sim_environment.h rename to lib/local-execution/include/local-execution/sim_environment.h index 4297d9d970..78608a3228 100644 --- a/lib/runtime/src/sim_environment.h +++ b/lib/local-execution/include/local-execution/sim_environment.h @@ -1,12 +1,13 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OPS_SIM_ENVIRONMENT_H -#define _FLEXFLOW_RUNTIME_SRC_OPS_SIM_ENVIRONMENT_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H +#define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H -#include "cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" +#include "local-execution/cost_metrics.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/task_argument_accessor.h" #include "op-attrs/parallel_tensor_shape.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/task_argument_accessor.h" +#include "pcg/machine_view.h" #include namespace FlexFlow { diff --git a/lib/runtime/include/runtime/task_spec/slot_id.h 
b/lib/local-execution/include/local-execution/slot_id.h similarity index 73% rename from lib/runtime/include/runtime/task_spec/slot_id.h rename to lib/local-execution/include/local-execution/slot_id.h index a5e4322d3c..53820fdb2f 100644 --- a/lib/runtime/include/runtime/task_spec/slot_id.h +++ b/lib/local-execution/include/local-execution/slot_id.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_ID_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_ID_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SPEC_SLOT_ID_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SPEC_SLOT_ID_H #include "utils/strong_typedef.h" diff --git a/lib/runtime/src/task_spec/slot_type.h b/lib/local-execution/include/local-execution/slot_type.h similarity index 86% rename from lib/runtime/src/task_spec/slot_type.h rename to lib/local-execution/include/local-execution/slot_type.h index 64b79ee281..957f89fa4e 100644 --- a/lib/runtime/src/task_spec/slot_type.h +++ b/lib/local-execution/include/local-execution/slot_type.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_TYPE_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_TYPE_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SLOT_TYPE_H +#define _FLEXFLOW_LOCAL_EXECUTION_SLOT_TYPE_H #include "utils/fmt.h" diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h new file mode 100644 index 0000000000..663c862e18 --- /dev/null +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -0,0 +1,155 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/linear_kernels.h" +#include "local-execution/arg_ref.h" +#include "local-execution/concrete_arg.h" +#include "local-execution/config.h" +#include "local-execution/device_specific.h" +#include "local-execution/op_task_signature.h" +#include 
"local-execution/permissions.h" +#include "local-execution/tasks.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "utils/variant.h" +#include +#include +#include +#include +#include +#include + +namespace FlexFlow { + +template +struct privilege_mode_to_accessor_t {}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorW; +}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorR; +}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorW; +}; + +template +using privilege_mode_to_accessor = + typename privilege_mode_to_accessor_t::type; + +using PrivilegeType = + std::variant; +using PrivilegeVariadicType = std::variant, + std::vector>; + +// TODO: define device state variant in another file +using DeviceStates = std::variant; + +using OpArgRefTypeBacking = + std::variant>; +using RuntimeArgRefTypeBacking = std::variant, + FFIterationConfig>; + +using ArgRefBacking = std:: + variant; + +struct ITaskArgumentAccessor { + ITaskArgumentAccessor &operator=(ITaskArgumentAccessor const &) = delete; + + virtual ~ITaskArgumentAccessor() = default; + + virtual ConcreteArgSpec const &get_concrete_arg(slot_id) const = 0; + virtual OpArgRefTypeBacking const &get_op_arg_ref(slot_id) const = 0; + virtual RuntimeArgRefTypeBacking const &get_runtime_arg(slot_id) const = 0; + + virtual PrivilegeType + get_tensor(slot_id slot, Permissions priv, IsGrad is_grad) const = 0; + virtual PrivilegeVariadicType get_variadic_tensor(slot_id slot, + Permissions priv, + IsGrad is_grad) const = 0; + + virtual Allocator get_allocator() const = 0; + virtual size_t get_device_idx() const = 0; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(ITaskArgumentAccessor); + +struct TaskArgumentAccessor { + template + T const &get_argument(slot_id slot) const { + if constexpr (is_in_variant::value) { + return std::get(this->ptr->get_op_arg_ref(slot)); + } else if constexpr (is_in_variant::value) { + 
return std::get(this->ptr->get_runtime_arg(slot)); + } else { + return this->ptr->get_concrete_arg(slot).get(); + } + } + + template + privilege_mode_to_accessor get_tensor(slot_id slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, IsGrad::NO)); + } + + template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, IsGrad::YES)); + } + + template + std::vector> + get_variadic_tensor(slot_id slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::NO)); + } + + template + std::vector> + get_variadic_tensor_grad(slot_id slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::YES)); + } + + Allocator get_allocator() const { + return this->ptr->get_allocator(); + } + + template + static + typename std::enable_if::value, + TaskArgumentAccessor>::type + create(Args &&...args) { + return TaskArgumentAccessor( + std::make_shared(std::forward(args)...)); + } + +private: + TaskArgumentAccessor(std::shared_ptr ptr) + : ptr(ptr) {} + std::shared_ptr ptr; +}; + +using DeviceStates = std::variant; + +using TaskImplFunction = std::variant< + std::function, + std::function(TaskArgumentAccessor const &)>>; + +template +TaskImplFunction get_task_impl(); + +template +OpTaskSignature get_signature(); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/tasks.h b/lib/local-execution/include/local-execution/tasks.h similarity index 95% rename from lib/runtime/src/tasks.h rename to lib/local-execution/include/local-execution/tasks.h index 0e07fa3f85..c78fefd4ea 100644 --- a/lib/runtime/src/tasks.h +++ b/lib/local-execution/include/local-execution/tasks.h @@ -1,8 +1,9 @@ -#ifndef _FLEXFLOW_TASKS_H -#define _FLEXFLOW_TASKS_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASKS_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASKS_H -#include "utils/optional.h" +#include #include +#include namespace FlexFlow { @@ -170,9 +171,9 @@ template void 
register_task(task_id_t, std::string const &name, F const &func, - optional cpu_func = nullopt); + std::optional cpu_func = std::nullopt); -template +template void register_task(); void register_tasks(); diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h similarity index 94% rename from lib/local-execution/include/tracked_allocator.h rename to lib/local-execution/include/local-execution/tracked_allocator.h index 4f51670426..ea3eec64e0 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H #include "kernels/allocation.h" -#include "local_allocator.h" +#include "local-execution/local_allocator.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/variadic_tensor_ref.h b/lib/local-execution/include/local-execution/variadic_tensor_ref.h new file mode 100644 index 0000000000..56da1bab64 --- /dev/null +++ b/lib/local-execution/include/local-execution/variadic_tensor_ref.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H + +#include "local-execution/arg_ref.h" +#include "local-execution/op_tensor_spec.h" + +namespace FlexFlow { + +enum class VariadicTensorRefType { INPUT_TENSORS }; + +template +using VariadicTensorRef = ArgRef; + +VariadicTensorRef get_input_tensors(); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local_allocator.cc b/lib/local-execution/src/local_allocator.cc index 0bb7d04574..d393643ead 100644 --- a/lib/local-execution/src/local_allocator.cc +++ b/lib/local-execution/src/local_allocator.cc @@ -1,4 +1,4 @@ -#include "local_allocator.h" +#include "local-execution/local_allocator.h" #include "kernels/device.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_arg_ref.cc 
b/lib/local-execution/src/op_arg_ref.cc new file mode 100644 index 0000000000..8e9b56272b --- /dev/null +++ b/lib/local-execution/src/op_arg_ref.cc @@ -0,0 +1,14 @@ +#include "local-execution/op_arg_ref.h" + +namespace FlexFlow { + +template +OpArgRef> per_device_op_state() { + return {OpArgRefType::PER_DEVICE_OP_STATE}; +} + +OpArgRef input_parallel_tensor_shape(int idx) { + return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc new file mode 100644 index 0000000000..adad2f3a72 --- /dev/null +++ b/lib/local-execution/src/op_task_invocation.cc @@ -0,0 +1,100 @@ +#include "local-execution/op_task_invocation.h" + +namespace FlexFlow { + +OpTensorSpec input_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::INPUT, option, idx}; +} + +OpTensorSpec output_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::OUTPUT, option, idx}; +} + +OpTensorSpec weight_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::WEIGHT, option, idx}; +} + +void OpTaskBinding::bind(slot_id slot, OpTensorSpec const &tensor_spec) { + this->tensor_bindings.insert({{slot, IsGrad::NO}, tensor_spec}); +} + +void OpTaskBinding::bind_grad(slot_id slot, OpTensorSpec const &tensor_spec) { + this->tensor_bindings.insert({{slot, IsGrad::YES}, tensor_spec}); +} + +std::unordered_map, OpTensorSpec> const & + OpTaskBinding::get_tensor_bindings() const { + return this->tensor_bindings; +} + +std::unordered_map const & + OpTaskBinding::get_arg_bindings() const { + return this->arg_bindings; +} + +OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { + OpTaskBinding bwd; + bwd.arg_bindings = fwd.get_arg_bindings(); + bwd.tensor_bindings = fwd.get_tensor_bindings(); + for (auto const &[key, spec] : fwd.get_tensor_bindings()) { + OpSlotOptions slot_option = 
spec.slot_option; + if (slot_option != OpSlotOptions::UNTRAINABLE || + slot_option != OpSlotOptions::OPTIONAL_UNTRAINABLE) { + slot_id slot = key.first; + bwd.bind_grad(slot, spec); + } + } + return bwd; +} + +bool is_op_tensor_spec_invalid(OpTensorSlotSpec tensor_slot_spec, + OpTensorSpec tensor_spec) { + return tensor_spec.role != tensor_slot_spec.tensor_role || + tensor_spec.slot_option != tensor_slot_spec.slot_option; +} + +bool is_tensor_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { + auto tensor_bindings = inv.binding.get_tensor_bindings(); + for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { + std::pair tensor_key = + std::make_pair(op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad); + OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); + if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { + return false; + } + } + return true; +} + +bool is_arg_type_invalid(std::type_index expected_arg_type, + OpArgSpec op_arg_spec) { + std::type_index arg_spec_type = std::visit( + [](auto &&arg) -> std::type_index { return arg.get_type_index(); }, + op_arg_spec); + return arg_spec_type != expected_arg_type; +} + +bool is_arg_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { + auto sig_arg_types = sig.get_arg_types(); + for (auto arg_binding : inv.binding.get_arg_bindings()) { + std::type_index arg_type = sig_arg_types.at(arg_binding.first); + if (is_arg_type_invalid(arg_type, arg_binding.second)) { + return false; + } + } + + return true; +} + +bool is_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { + return is_tensor_invocation_valid(sig, inv) && + is_arg_invocation_valid(sig, inv); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc new file mode 100644 index 0000000000..53a685910e --- /dev/null +++ 
b/lib/local-execution/src/op_task_signature.cc @@ -0,0 +1,81 @@ +#include "local-execution/op_task_signature.h" + +namespace FlexFlow { + +OpTaskSignature::OpTaskSignature(OpTaskType t) : type(t){}; + +void OpTaskSignature::add_input_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_untrainable_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::INPUT, + IsGrad::NO, + OpSlotOptions::UNTRAINABLE}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_untrainable_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::INPUT, + IsGrad::NO, + OpSlotOptions::OPTIONAL_UNTRAINABLE}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_output_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::OUTPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_bwd_necessary_output_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::OUTPUT, + IsGrad::NO, + OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_weight_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::WEIGHT, + IsGrad::NO, + 
OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_weight_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::WEIGHT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::set_arg_types( + std::unordered_map const &arg_type) { + this->task_arg_types = arg_type; +} + +void OpTaskSignature::add_from_slot_spec(OpTensorSlotSpec const &spec) { + this->op_tensor_slots.insert(spec); +} + +} // namespace FlexFlow diff --git a/lib/runtime/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc similarity index 84% rename from lib/runtime/src/ops/attention.cc rename to lib/local-execution/src/ops/attention.cc index 41905f9014..6e6d23cd4a 100644 --- a/lib/runtime/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -15,19 +15,12 @@ #include "attention.h" #include "kernels/attention_kernels.h" -#include "legion.h" -#include "op-attrs/ops/attention.h" -#include "task_spec/op_task_signature.h" +#include "local-execution/op_task_signature.h" namespace FlexFlow { using namespace FlexFlow::Kernels::MultiHeadAttention; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { QUERY_PARALLEL_TENSOR_SHAPE, KEY_PARALLEL_TENSOR_SHAPE, @@ -86,6 +79,12 @@ OpTaskInvocation backward(MultiHeadAttentionAttrs const &attrs) { return {ATTENTION_BWD_TASK_ID, b}; } +// OpArgBacking +// generate_op_arg_backing(std::vector +// tensor_shape_args) { + +// } + static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); @@ -122,35 +121,42 @@ static DeviceSpecific int num_samples = get_piece_shape(query_parallel_tensor_shape)[ff_dim_t(2)]; int num_heads = get_piece_shape(weight_parallel_tensor_shape)[ff_dim_t(1)]; + // MHAPerDeviceState per_device_state = + // 
init_kernel(handle, + // allocator, + // num_samples, + // num_heads, + // qSize, + // kSize, + // vSize, + // qProjSize, + // kProjSize, + // vProjSize, + // oProjSize, + // qoSeqLength, + // kvSeqLength, + // attrs.add_bias_kv); + // return acc.create_device_specific(per_device_state); + DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - attrs.add_bias_kv)); + init_kernel(handle, + allocator, + num_samples, + num_heads, + qSize, + kSize, + vSize, + qProjSize, + kProjSize, + vProjSize, + oProjSize, + qoSeqLength, + kvSeqLength, + attrs.add_bias_kv); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto query = acc.get_tensor(QUERY); auto key = acc.get_tensor(KEY); auto value = acc.get_tensor(VALUE); @@ -162,7 +168,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[MultiHeadAttention] forward_time = %.2lfms\n", + "[MultiHeadAttention] forward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), key.get_float_ptr(), @@ -171,15 +177,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto query 
= acc.get_tensor(QUERY); auto key = acc.get_tensor(KEY); auto value = acc.get_tensor(VALUE); @@ -208,7 +207,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[MultiHeadAttention] backward_time = %.2lfms\n", + "[MultiHeadAttention] backward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), query_grad.get_float_ptr(), @@ -221,14 +220,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, MultiHeadAttentionAttrs const &attrs, InputParallelTensorDesc const &query_shape, @@ -307,7 +298,12 @@ void register_task() { register_task(ATTENTION_INIT_TASK_ID, "Attention Init", init_signature(), - init_task); + init_task_impl); +} + +template <> +OpTaskSignature get_signature() { + return init_signature(); } template <> @@ -331,13 +327,13 @@ void register_task() { register_task(ATTENTION_FWD_TASK_ID, "Attention Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> OpTaskSignature bwd_signature() { OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(ATTENTION_FWD_TASK_ID)); + infer_bwd_signature(fwd_signature()); return bwd; } @@ -347,7 +343,7 @@ void register_task() { register_task(ATTENTION_BWD_TASK_ID, "Attention Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/attention.h b/lib/local-execution/src/ops/attention.h similarity index 91% rename from lib/runtime/src/ops/attention.h rename to lib/local-execution/src/ops/attention.h index 09a4ef036f..c8eb17ecec 100644 --- a/lib/runtime/src/ops/attention.h +++ b/lib/local-execution/src/ops/attention.h @@ -1,9 +1,9 @@ #ifndef 
_FLEXFLOW_ATTENTION_H #define _FLEXFLOW_ATTENTION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/attention.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc similarity index 87% rename from lib/runtime/src/ops/batch_matmul.cc rename to lib/local-execution/src/ops/batch_matmul.cc index 5f40def699..187e97ecaa 100644 --- a/lib/runtime/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -15,20 +15,14 @@ #include "batch_matmul.h" #include "kernels/batch_matmul_kernels.h" -#include "legion.h" +#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" -#include "task_spec/op_task_signature.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchMatmul; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { A_INPUT, // tensor B_INPUT, // tensor @@ -60,7 +54,7 @@ OpTaskInvocation backward(BatchMatmulAttrs const &attrs) { return {BATCHMATMUL_BWD_TASK_ID, bwd}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto a_input = acc.get_tensor(A_INPUT); auto b_input = acc.get_tensor(B_INPUT); auto output = acc.get_tensor(OUTPUT); @@ -91,7 +85,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[BatchMatmul] forward_time = %.2lfms\n", + "[BatchMatmul] forward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), a_input.get_float_ptr(), @@ -105,15 +99,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { iter_config.seq_length); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime 
*runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { // BatchMatmul* bmm = (BatchMatmul*) task->args; FFIterationConfig iter_config = acc.get_argument(ITERATION_CONFIG); @@ -151,7 +138,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[BatchMatmul] backward_time = %.2lfms\n", + "[BatchMatmul] backward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), output_grad.get_float_ptr(), @@ -165,14 +152,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { batch); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, BatchMatmulAttrs const &attrs, InputParallelTensorDesc const &a_input, @@ -225,7 +204,7 @@ void register_task() { register_task(BATCHMATMUL_FWD_TASK_ID, "BatchMatmul Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -241,7 +220,7 @@ void register_task() { register_task(BATCHMATMUL_BWD_TASK_ID, "BatchMatmul Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/batch_matmul.h b/lib/local-execution/src/ops/batch_matmul.h similarity index 84% rename from lib/runtime/src/ops/batch_matmul.h rename to lib/local-execution/src/ops/batch_matmul.h index 7d3f2308da..94457c22be 100644 --- a/lib/runtime/src/ops/batch_matmul.h +++ b/lib/local-execution/src/ops/batch_matmul.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_BATCH_MATMUL_H #define _FLEXFLOW_BATCH_MATMUL_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/op_task_signature.h" +#include 
"local-execution/sim_environment.h" #include "op-attrs/ops/batch_matmul.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc similarity index 81% rename from lib/runtime/src/ops/batch_norm.cc rename to lib/local-execution/src/ops/batch_norm.cc index a52981a8a3..97830f90fe 100644 --- a/lib/runtime/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -15,17 +15,11 @@ #include "batch_norm.h" #include "kernels/batch_norm_kernels.h" -#include "legion/legion_utilities.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchNorm; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { INPUT, // tensor SCALE, // tensor @@ -88,29 +82,19 @@ static DeviceSpecific float *runningMean; DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - runningMean, - output_n, - output_c, - output_h, - output_w, - attrs.relu)); + init_kernel(handle, + allocator, + runningMean, + output_n, + output_c, + output_h, + output_w, + attrs.relu); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -122,23 +106,16 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[BatchNorm] forward_time = %.2lfms\n", - &per_device_state, + "[BatchNorm] forward_time = {:.2lf}ms\n", + per_device_state, 
input.get_float_ptr(), output.get_float_ptr(), scale.get_float_ptr(), bias.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -153,8 +130,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[BatchNorm] backward_time = %.2lfms\n", - &per_device_state, + "[BatchNorm] backward_time = {:.2lf}ms\n", + per_device_state, input.get_float_ptr(), output_grad.get_float_ptr(), output.get_float_ptr(), @@ -165,14 +142,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output.shape.get_volume()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, BatchNormAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -221,6 +190,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); init.add_input_slot(BIAS); init.add_output_slot(OUTPUT); @@ -236,7 +206,7 @@ void register_task() { register_task(BATCHNORM_INIT_TASK_ID, "BatchNorm Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -258,7 +228,7 @@ void register_task() { register_task(BATCHNORM_FWD_TASK_ID, "BatchNorm Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -274,7 +244,7 @@ void register_task() { 
register_task(BATCHNORM_BWD_TASK_ID, "BatchNorm Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/batch_norm.h b/lib/local-execution/src/ops/batch_norm.h similarity index 89% rename from lib/runtime/src/ops/batch_norm.h rename to lib/local-execution/src/ops/batch_norm.h index 906e85a57c..1745a5cac8 100644 --- a/lib/runtime/src/ops/batch_norm.h +++ b/lib/local-execution/src/ops/batch_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/batch_norm.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc similarity index 79% rename from lib/runtime/src/ops/cast.cc rename to lib/local-execution/src/ops/cast.cc index 44230eaf46..9e1f777d73 100644 --- a/lib/runtime/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -15,17 +15,12 @@ #include "cast.h" #include "kernels/cast_kernels.h" -#include "legion/legion_utilities.h" -#include "task_spec/op_task_signature.h" + +#include "local-execution/op_task_signature.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Cast; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - namespace FlexFlow { enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -48,7 +43,7 @@ OpTaskInvocation backward(CastAttrs const &attrs) { return {CAST_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -57,22 +52,15 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, 
profiling, - "[Cast] forward_time = %.2lfms\n", + "[Cast] forward_time = {:.2lf}ms\n", input, output, input.data_type, attrs.dtype); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -83,21 +71,13 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Cast] forward_time = %.2lfms\n", + "[Cast] forward_time = {:.2lf}ms\n", input_grad, output_grad, input.data_type, attrs.dtype); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, CastAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -143,7 +123,7 @@ void register_task() { register_task(CAST_FWD_TASK_ID, "Cast Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -158,7 +138,7 @@ void register_task() { register_task(CAST_BWD_TASK_ID, "Cast Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/cast.h b/lib/local-execution/src/ops/cast.h similarity index 93% rename from lib/runtime/src/ops/cast.h rename to lib/local-execution/src/ops/cast.h index c0c500e869..69aeadf497 100644 --- a/lib/runtime/src/ops/cast.h +++ b/lib/local-execution/src/ops/cast.h @@ -15,9 +15,9 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/cast.h" 
-#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc similarity index 75% rename from lib/runtime/src/ops/combine.cc rename to lib/local-execution/src/ops/combine.cc index 46d5ebb4fe..6df09b53f4 100644 --- a/lib/runtime/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -15,15 +15,11 @@ #include "combine.h" #include "kernels/combine_kernels.h" -#include "task_spec/op_task_invocation.h" +#include "local-execution/op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::Combine; @@ -46,7 +42,7 @@ OpTaskInvocation backward(CombineAttrs const &attrs) { return {COMBINE_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -54,20 +50,13 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Combine] forward_time = %.2lfms\n", + "[Combine] forward_time = {:.2lf}ms\n", input, output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); @@ -75,19 +64,11 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Combine] forward_time = 
%.2lfms\n", + "[Combine] backward_time = {:.2lf}ms\n", input_grad, output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, CombineAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -117,7 +98,7 @@ void register_task() { register_task(COMBINE_FWD_TASK_ID, "Combine Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -133,7 +114,7 @@ void register_task() { register_task(COMBINE_BWD_TASK_ID, "Combine Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/combine.h b/lib/local-execution/src/ops/combine.h similarity index 87% rename from lib/runtime/src/ops/combine.h rename to lib/local-execution/src/ops/combine.h index 6b3a43863b..f9349a01ef 100644 --- a/lib/runtime/src/ops/combine.h +++ b/lib/local-execution/src/ops/combine.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/combine.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc similarity index 78% rename from lib/runtime/src/ops/concat.cc rename to lib/local-execution/src/ops/concat.cc index 1ce549cc57..f3c2eba48f 100644 --- a/lib/runtime/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -15,21 +15,16 @@ #include "concat.h" #include "kernels/concat_kernels.h" -#include "legion/legion_utilities.h" + +#include "local-execution/op_task_signature.h" +#include "local-execution/variadic_tensor_ref.h" #include "op-attrs/get_output_shapes.h" -#include "task_spec/op_task_signature.h" -#include 
"task_spec/variadic_tensor_ref.h" #include "utils/hash-utils.h" namespace FlexFlow { using namespace FlexFlow::Kernels::Concat; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { INPUTS, OUTPUT, ATTRS, PROFILING, HANDLE, NUM_INPUTS }; OpTaskInvocation forward(ConcatAttrs const &attrs) { @@ -48,7 +43,7 @@ OpTaskInvocation backward(ConcatAttrs const &attrs) { return {CONCAT_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -59,21 +54,14 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Concat] forward_time = %.2lfms\n", + "[Concat] forward_time = {:.2lf}ms\n", output, inputs, attrs.axis); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -84,20 +72,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Concat] backward_time = %.2lfms\n", + "[Concat] backward_time = {:.2lf}ms\n", output_grad, input_grads, attrs.axis); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ConcatAttrs const &attrs, @@ -132,6 +112,7 @@ CostMetrics template <> OpTaskSignature 
fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); + fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUTS, SlotType::VARIADIC); @@ -145,13 +126,13 @@ void register_task() { register_task(CONCAT_FWD_TASK_ID, "Concat Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> OpTaskSignature bwd_signature() { OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(CONCAT_FWD_TASK_ID)); + infer_bwd_signature(fwd_signature()); return bwd; } @@ -161,7 +142,7 @@ void register_task() { register_task(CONCAT_BWD_TASK_ID, "Concat Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/concat.h b/lib/local-execution/src/ops/concat.h similarity index 89% rename from lib/runtime/src/ops/concat.h rename to lib/local-execution/src/ops/concat.h index 27dec47743..fa61d87e77 100644 --- a/lib/runtime/src/ops/concat.h +++ b/lib/local-execution/src/ops/concat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/concat.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc similarity index 78% rename from lib/runtime/src/ops/conv_2d.cc rename to lib/local-execution/src/ops/conv_2d.cc index 01d8abab55..eef4c21a45 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -1,17 +1,10 @@ #include "conv_2d.h" #include "kernels/conv_2d_kernels.h" -#include "legion/legion_utilities.h" -#include "mpark/variant.hpp" #include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Conv2D; enum Slots { @@ -70,33 +63,23 @@ 
static DeviceSpecific auto filter_grad = acc.get_tensor_grad(FILTER); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - attrs.activation, - attrs.kernel_h, - attrs.kernel_w, - attrs.groups, - attrs.padding_h, - attrs.padding_w, - attrs.stride_h, - attrs.stride_w, - input, - output, - filter.get_float_ptr(), - filter_grad.get_float_ptr())); + init_kernel(handle, + attrs.activation, + attrs.kernel_h, + attrs.kernel_w, + attrs.groups, + attrs.padding_h, + attrs.padding_w, + attrs.stride_h, + attrs.stride_w, + input, + output, + filter.get_float_ptr(), + filter_grad.get_float_ptr()); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -109,7 +92,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Conv2d] forward_time = %.2lfms\n", + "[Conv2d] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -118,15 +101,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.activation); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -143,7 +119,7 @@ 
static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Conv2d] backward_time = %.2lfms\n", + "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), input_grad.get_float_ptr(), @@ -155,14 +131,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.activation); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, Conv2DAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -228,7 +196,7 @@ void register_task() { register_task(CONV2D_INIT_TASK_ID, "Conv2d Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -252,7 +220,7 @@ void register_task() { register_task(CONV2D_FWD_TASK_ID, "Conv2d Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -268,7 +236,7 @@ void register_task() { register_task(CONV2D_BWD_TASK_ID, "Conv2d Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/conv_2d.h b/lib/local-execution/src/ops/conv_2d.h similarity index 89% rename from lib/runtime/src/ops/conv_2d.h rename to lib/local-execution/src/ops/conv_2d.h index 7225099a47..0c8181adce 100644 --- a/lib/runtime/src/ops/conv_2d.h +++ b/lib/local-execution/src/ops/conv_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/conv_2d.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc similarity index 77% rename from lib/runtime/src/ops/dropout.cc rename to 
lib/local-execution/src/ops/dropout.cc index fe85afea38..9d680054ea 100644 --- a/lib/runtime/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -1,18 +1,12 @@ #include "dropout.h" #include "kernels/dropout_kernels.h" -#include "legion/legion_utilities.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/task_signature.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Dropout; enum Slots { INPUT, OUTPUT, ATTRS, PER_DEVICE_STATE, FF_HANDLE, PROFILING }; @@ -54,21 +48,11 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator)); + init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -77,21 +61,14 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Dropout] forward_time = %.2lfms\n", + "[Dropout] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, 
runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -102,20 +79,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Dropout] backward_time = %.2lfms\n", + "[Dropout] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, DropoutAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -156,6 +125,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); + init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(FF_HANDLE); init.add_output_slot(OUTPUT); @@ -170,7 +140,7 @@ void register_task() { register_task(DROPOUT_INIT_TASK_ID, "Dropout Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -191,7 +161,7 @@ void register_task() { register_task(DROPOUT_FWD_TASK_ID, "Dropout Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -207,7 +177,7 @@ void register_task() { register_task(DROPOUT_BWD_TASK_ID, "Dropout Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/dropout.h b/lib/local-execution/src/ops/dropout.h similarity index 85% rename from lib/runtime/src/ops/dropout.h rename to lib/local-execution/src/ops/dropout.h index 88a255d140..53fbeb3857 100644 --- a/lib/runtime/src/ops/dropout.h +++ b/lib/local-execution/src/ops/dropout.h @@ 
-1,10 +1,10 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" +#include "local-execution/tasks.h" #include "op-attrs/ops/dropout.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" -#include "tasks.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc similarity index 81% rename from lib/runtime/src/ops/element_binary.cc rename to lib/local-execution/src/ops/element_binary.cc index f6be2198ca..a2e9ee2ba8 100644 --- a/lib/runtime/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -1,16 +1,11 @@ #include "element_binary.h" #include "kernels/element_binary_kernels.h" -#include "legion/legion_utilities.h" + #include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::ElementBinary; enum Slots { @@ -66,27 +61,17 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - attrs.type, - attrs.should_broadcast_lhs, - attrs.should_broadcast_rhs, - input_lhs.shape, - input_rhs.shape, - output.shape)); + init_kernel(handle, + attrs.type, + attrs.should_broadcast_lhs, + attrs.should_broadcast_rhs, + input_lhs.shape, + input_rhs.shape, + output.shape); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto 
per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -99,7 +84,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[ElementBinary] forward_time = %.2lfms\n", + "[ElementBinary] forward_time = {:.2lf}ms\n", per_device_state, input_lhs.get_float_ptr(), input_rhs.get_float_ptr(), @@ -109,15 +94,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { handle); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -133,7 +111,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[ElementBinary] backward_time = %.2lfms\n", + "[ElementBinary] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_lhs.get_float_ptr(), @@ -146,14 +124,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { handle); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ElementBinaryAttrs const &attrs, @@ -221,7 +191,7 @@ void register_task() { register_task(ELEMENTBINARY_INIT_TASK_ID, "ElementBinary Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -245,7 +215,7 @@ void register_task() { register_task(ELEMENTBINARY_FWD_TASK_ID, "ElementBinary Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -261,7 +231,7 @@ void register_task() { 
register_task(ELEMENTBINARY_BWD_TASK_ID, "ElementBinary Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/element_binary.h b/lib/local-execution/src/ops/element_binary.h similarity index 95% rename from lib/runtime/src/ops/element_binary.h rename to lib/local-execution/src/ops/element_binary.h index 342909c468..fa4202dffd 100644 --- a/lib/runtime/src/ops/element_binary.h +++ b/lib/local-execution/src/ops/element_binary.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_ELEMENT_BINARY_H #define _FLEXFLOW_ELEMENT_BINARY_H +#include "local-execution/sim_environment.h" #include "op-attrs/ops/element_binary.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc similarity index 76% rename from lib/runtime/src/ops/element_unary.cc rename to lib/local-execution/src/ops/element_unary.cc index f41a8b3551..2ad5d797f5 100644 --- a/lib/runtime/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -1,15 +1,11 @@ #include "element_unary.h" #include "kernels/element_unary_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::ElementUnary; @@ -27,7 +23,6 @@ enum Slots { OpTaskInvocation init(ElementUnaryUnifiedAttrs const &attrs) { OpTaskBinding b; - b.bind_arg(HANDLE, ff_handle()); b.bind_arg(ATTRS, attrs); b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0)); @@ -58,32 +53,21 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); ProfilingSettings profiling = acc.get_argument(PROFILING); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); ParallelTensorShape input_shape = acc.get_argument(INPUT_SHAPE); ParallelTensorShape output_shape = 
get_output_shape(attrs, input_shape); - DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(input_shape, output_shape, attrs)); + DeviceSpecific per_device_state = init_kernel( + get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto &handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = @@ -91,7 +75,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[ElementUnary] forward_time = %.2lfms\n", + "[ElementUnary] forward_time = {:.2lf}ms\n", per_device_state, attrs, handle, @@ -99,22 +83,15 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto &handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); 
@@ -122,7 +99,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[ElementUnary] backward_time = %.2lfms\n", + "[ElementUnary] backward_time = {:.2lf}ms\n", per_device_state, attrs, handle, @@ -132,14 +109,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ElementUnaryUnifiedAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -147,7 +116,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, MachineView const &mv) { auto env = sim.new_environment(); - ParallelTensorShape output_shape = get_output_shape(attrs, input_shape); + ParallelTensorShape output_shape = get_output_shape(attrs, input_shape.shape); SimTaskBinding init_binding; init_binding.bind_arg(HANDLE, ff_handle()); @@ -182,6 +151,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); + init.add_arg_slot(INPUT_SHAPE); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -196,7 +166,7 @@ void register_task() { register_task(ELEMENTUNARY_INIT_TASK_ID, "ElementUnary Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -217,7 +187,7 @@ void register_task() { register_task(ELEMENTUNARY_FWD_TASK_ID, "ElementUnary Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -233,7 +203,7 @@ void register_task() { register_task(ELEMENTUNARY_BWD_TASK_ID, "ElementUnary Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/element_unary.h b/lib/local-execution/src/ops/element_unary.h similarity index 85% rename from 
lib/runtime/src/ops/element_unary.h rename to lib/local-execution/src/ops/element_unary.h index f44efc28db..e0f58e8a75 100644 --- a/lib/runtime/src/ops/element_unary.h +++ b/lib/local-execution/src/ops/element_unary.h @@ -1,15 +1,12 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/element_unary.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { -using ElementUnaryUnifiedAttrs = - variant; - template <> void register_task(); template <> diff --git a/lib/runtime/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc similarity index 81% rename from lib/runtime/src/ops/embedding.cc rename to lib/local-execution/src/ops/embedding.cc index a1bc915d2f..00d6d033d4 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -15,17 +15,11 @@ #include "embedding.h" #include "kernels/embedding_kernels.h" -#include "legion.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/embedding.h" namespace FlexFlow { -// declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Embedding; enum Slots { INPUT, WEIGHT, OUTPUT, ATTRS, PROFILING }; @@ -49,7 +43,7 @@ OpTaskInvocation backward(EmbeddingAttrs const &attrs) { return {EMBED_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); @@ -59,7 +53,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Embedding] forward_time = %.2lfms\n", + "[Embedding] forward_time = {:.2lf}ms\n", input, output, weight, @@ -71,15 +65,8 @@ static optional 
forward_task_impl(TaskArgumentAccessor const &acc) { input.shape[legion_dim_t(1)]); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); @@ -89,7 +76,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Embedding] forward_time = %.2lfms\n", + "[Embedding] backward_time = {:.2lf}ms\n", input, output, weight_grad, @@ -98,15 +85,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), - input.shape[ff_dim_t(0)]); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + input.shape.at(ff_dim_t(0))); } CostMetrics measure_operator_cost(SimEnvFactory const &sim, @@ -158,7 +137,7 @@ void register_task() { register_task(EMBED_FWD_TASK_ID, "Embed Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -172,7 +151,7 @@ void register_task() { register_task(EMBED_BWD_TASK_ID, "Embed Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/embedding.h b/lib/local-execution/src/ops/embedding.h similarity index 88% rename from lib/runtime/src/ops/embedding.h rename to lib/local-execution/src/ops/embedding.h index cd1b14fa66..c33b1161bf 100644 --- a/lib/runtime/src/ops/embedding.h +++ b/lib/local-execution/src/ops/embedding.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H 
+#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/embedding.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc similarity index 76% rename from lib/runtime/src/ops/flat.cc rename to lib/local-execution/src/ops/flat.cc index f53a6185b6..3c2499da79 100644 --- a/lib/runtime/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -5,10 +5,6 @@ namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::Flat; @@ -30,27 +26,20 @@ OpTaskInvocation backward(FlatAttrs const &attrs) { return {FLAT_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, profiling, - "[Flat] forward_time = %.2lfms\n", + "[Flat] forward_time = {:.2lf}ms\n", input, output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -59,20 +48,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Flat] forward_time = %.2lfms\n", + "[Flat] backward_time = {:.2lf}ms\n", input, input_grad.get_float_ptr(), output_grad.get_float_ptr()); } -static void 
backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, FlatAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -115,7 +96,7 @@ void register_task() { register_task(FLAT_FWD_TASK_ID, "Flat Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -130,7 +111,7 @@ void register_task() { register_task(FLAT_BWD_TASK_ID, "Flat Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/flat.h b/lib/local-execution/src/ops/flat.h similarity index 93% rename from lib/runtime/src/ops/flat.h rename to lib/local-execution/src/ops/flat.h index 13246028fb..d9ea4d3985 100644 --- a/lib/runtime/src/ops/flat.h +++ b/lib/local-execution/src/ops/flat.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H +#include "local-execution/sim_environment.h" #include "op-attrs/ops/flat.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc new file mode 100644 index 0000000000..50b27d72a6 --- /dev/null +++ b/lib/local-execution/src/ops/gather.cc @@ -0,0 +1,215 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "gather.h" +#include "kernels/gather_kernels.h" +#include "local-execution/legion_tensor_shape.h" +#include "op-attrs/get_output_shapes.h" +#include + +namespace FlexFlow { + +using namespace FlexFlow::Kernels::Gather; + +enum Slots { INPUT, OUTPUT, INDEX, ATTRS, HANDLE, PROFILING, PER_DEVICE_STATE }; + +OpTaskInvocation init(GatherAttrs const &attrs) { + OpTaskBinding binding; + + binding.bind(INPUT, input_tensor(0)); + binding.bind(INDEX, input_tensor(1)); + binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + + return {GATHER_INIT_TASK_ID, binding}; +} + +OpTaskInvocation forward(GatherAttrs const &attrs) { + OpTaskBinding binding; + + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INDEX, weight_tensor(0)); + + return {GATHER_FWD_TASK_ID, binding}; +} + +OpTaskInvocation backward(GatherAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); + + return {GATHER_BWD_TASK_ID, binding}; +} + +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto index = acc.get_tensor(INDEX); + auto output = acc.get_tensor(OUTPUT); + + PerDeviceFFHandle handle = acc.get_argument(HANDLE); + auto const &attrs = acc.get_argument(ATTRS); + legion_dim_t legion_dim = to_legion(attrs.dim, input.shape.num_dims()); + + assert(input.shape.get_dim() == index.shape.get_dim()); + assert(output.shape.get_dim() == index.shape.get_dim()); + + for (int i = 0; i < input.shape.get_dim(); i++) { + assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); + if (i != legion_dim.value()) { + assert(input.shape[legion_dim_t(i)] == index.shape[legion_dim_t(i)]); + } + } + + return DeviceSpecific({handle, legion_dim}); +} 
+ +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + + auto input = acc.get_tensor(INPUT); + auto index = acc.get_tensor(INDEX); + auto output = acc.get_tensor(OUTPUT); + + return profile(forward_kernel, + profiling, + "[Gather] forward_time = {:.2lf}ms\n", + per_device_state, + input, + index, + output); +} + +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto index = acc.get_tensor(INDEX); + auto input_grad = acc.get_tensor_grad(INPUT); + + return profile(backward_kernel, + profiling, + "[Gather] backward_time = {:.2lf}ms\n", + per_device_state, + output_grad, + index, + input_grad); +} + +CostMetrics measure_operator_cost(SimEnvFactory const &sim, + GatherAttrs const &attrs, + InputParallelTensorDesc const &input_shape, + InputParallelTensorDesc const &index_shape, + ProfilingSettings const &settings, + MachineView const &mv) { + + auto env = sim.new_environment(); + + std::vector output_shape = + get_output_shapes(attrs, input_shape.shape, index_shape.shape); + + SimTaskBinding fwd_binding; + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(ATTRS, attrs); + + fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind(INDEX, index_shape); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(GATHER_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(GATHER_BWD_TASK_ID, bwd_binding); + + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return 
make_metrics(forward_time, backward_time, sync_time, env); +} + +template <> +OpTaskSignature init_signature() { + OpTaskSignature init(OpTaskType::INIT); + + init.add_input_slot(INPUT); + init.add_input_slot(INDEX); + init.add_output_slot(OUTPUT); + + init.add_arg_slot(ATTRS); + init.add_unchecked_arg_slot(HANDLE); + + init.add_return_value(); + + return init; +} + +template <> +void register_task() { + register_task(GATHER_INIT_TASK_ID, + "Gather Init", + init_signature(), + init_task_impl); +} + +template <> +OpTaskSignature fwd_signature() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(ATTRS); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + fwd.add_weight_slot(INDEX); + + return fwd; +} + +template <> +void register_task() { + register_task(GATHER_FWD_TASK_ID, + "Gather Fwd", + fwd_signature(), + forward_task_impl); +} + +template <> +OpTaskSignature bwd_signature() { + OpTaskSignature bwd = + infer_bwd_signature(fwd_signature()); + + return bwd; +} + +template <> +void register_task() { + register_task(GATHER_BWD_TASK_ID, + "Gather Bwd", + bwd_signature(), + backward_task_impl); +} + +}; // namespace FlexFlow diff --git a/lib/local-execution/src/ops/gather.h b/lib/local-execution/src/ops/gather.h new file mode 100644 index 0000000000..e2de09d96a --- /dev/null +++ b/lib/local-execution/src/ops/gather.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_GATHER_H +#define _FLEXFLOW_GATHER_H + +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" +#include "op-attrs/ops/gather.h" + +namespace FlexFlow { + +template <> +void register_task(); +template <> +void register_task(); +template <> +void register_task(); + +OpTaskInvocation init(GatherAttrs const &); +OpTaskInvocation forward(GatherAttrs const &); +OpTaskInvocation backward(GatherAttrs const &); + +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + GatherAttrs const &attrs, + InputParallelTensorDesc 
const &input, + InputParallelTensorDesc const &index, + ProfilingSettings const &settings, + MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc similarity index 61% rename from lib/runtime/src/ops/layer_norm.cc rename to lib/local-execution/src/ops/layer_norm.cc index 6bc671c249..620758772c 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -15,20 +15,17 @@ #include "layer_norm.h" #include "kernels/layer_norm_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" #include -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - namespace FlexFlow { +using namespace FlexFlow::Kernels::LayerNorm; + enum Slots { PROFILING, INPUT, @@ -59,7 +56,7 @@ OpTaskInvocation forward(LayerNormAttrs const &attrs) { b.bind(GAMMA, weight_tensor(0)); // todo, this may have some problem b.bind(BETA, weight_tensor(1)); // how to get gmmam and beta b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, per_device_state()); + b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); return {LAYERNORM_FWD_TASK_ID, b}; } @@ -70,71 +67,56 @@ OpTaskInvocation backward(LayerNormAttrs const &attrs) { return {LAYERNORM_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto gamma = acc.get_tensor(GAMMA); - auto beta = acc.get_tensor(BETA); +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto gamma = acc.get_tensor(GAMMA); + auto beta = 
acc.get_tensor(BETA); ProfilingSettings profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); return profile(forward_kernel, profiling, - "[LayerNorm] forward time = %.2lfms\n", + "[LayerNorm] forward time = {:.2lf}ms\n", state, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - beta.get_float_ptr()); + input, + output, + gamma, + beta); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto gamma = acc.get_tensor(GAMMA); -static optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto gamma = acc.get_tensor(GAMMA); - - auto input_grad = acc.get_tensor(INPUT_GRAD); - auto gamma_grad = acc.get_tensor(GAMMA_GRAD); - auto beta_grad = acc.get_tensor(BETA_GRAD); - auto output_grad = acc.get_tensor(OUTPUT_GRAD); + auto input_grad = acc.get_tensor_grad(INPUT); + auto gamma_grad = acc.get_tensor_grad(GAMMA); + auto beta_grad = acc.get_tensor_grad(BETA); + auto output_grad = acc.get_tensor_grad(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); return profile(backward_kernel, profiling, - "[LayerNorm] backward time = %.2lfms\n", + "[LayerNorm] backward time = {:.2lf}ms\n", state, - output_grad.get_float_ptr(), - input.get_float_ptr(), - input_grad.get_float_ptr(), - gamma.get_float_ptr(), - gamma_grad.get_float_ptr(), - beta_grad.get_float_ptr()); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + output_grad, + input, + input_grad, + gamma, + gamma_grad, + beta_grad); } static 
DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); + auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); - auto input = acc.get_tensor(INPUT); - FFHandler handle = acc.get_argument(HANDLE); + auto input = acc.get_tensor(INPUT); + auto handle = acc.get_argument(HANDLE); // question: how to get batch_size and effective_num_elements int64_t effective_batch_size, effective_num_elements; @@ -143,29 +125,20 @@ static DeviceSpecific M *= input.shape.at(legion_dim_t(attrs.axes[i])); } int num_replicas = 1; - for (int i = 0; i < intput.shape.num_dims(); i++) { + for (int i = 0; i < input.shape.num_dims(); i++) { num_replicas *= input.shape.at(legion_dim_t(i)); effective_num_elements = M; effective_batch_size = input.shape.get_volume() / M; - - DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, - attrs.eps)); } -} -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); + DeviceSpecific per_device_state = + init_kernel(handle, + allocator, + attrs.elementwise_affine, + effective_batch_size, + effective_num_elements, + attrs.eps); + return per_device_state; } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -173,18 +146,19 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; init_binding.bind_arg(HANDLE, ff_handle()); init_binding.bind_arg(ATTRS, attrs); - init.binding.bind(INPUT, 
input.shape); + init_binding.bind(INPUT, input.shape); auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding); - DeviceSpecific = init_task_impl(init_accessor); + DeviceSpecific per_device_state = + init_task_impl(init_accessor); SimTaskBinding fwd_binding; fwd_binding.bind(INPUT, input.shape); @@ -192,9 +166,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); - // TODO how to handle gamma and beta, where are they from - fwd_binding.bind(GAMMA, input_shape); - fwd_binding.bind(BETA, input_shape); + fwd_binding.bind(GAMMA, input.shape); + fwd_binding.bind(BETA, input.shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding); @@ -222,7 +195,7 @@ OpTaskSignature fwd_signature() { } template <> -OpTaskSignature bwd_signature() { +OpTaskSignature bwd_signature() { OpTaskSignature bwd = infer_bwd_signature(fwd_signature()); return bwd; @@ -231,6 +204,7 @@ OpTaskSignature bwd_signature() { template <> OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -245,7 +219,7 @@ void register_task() { register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signature(), - init_task); + init_task_impl); } template <> @@ -253,15 +227,15 @@ void register_task() { register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd_signature(), - forward_task); + forward_task_impl); } template <> void register_task() { register_task(LAYERNORM_BWD_TASK_ID, "LayerNorm backward", - bwd_signature(), - backward_task); + bwd_signature(), + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/layer_norm.h b/lib/local-execution/src/ops/layer_norm.h similarity index 97% rename from lib/runtime/src/ops/layer_norm.h rename 
to lib/local-execution/src/ops/layer_norm.h index 83e6733bf6..4eadb9ff09 100644 --- a/lib/runtime/src/ops/layer_norm.h +++ b/lib/local-execution/src/ops/layer_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/layer_norm.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc similarity index 54% rename from lib/runtime/src/ops/linear.cc rename to lib/local-execution/src/ops/linear.cc index 96d037913c..e2c9d9aef4 100644 --- a/lib/runtime/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -1,32 +1,14 @@ #include "linear.h" #include "kernels/linear_kernels.h" -#include "layer.h" -#include "legion/legion_utilities.h" +#include "local-execution/task_argument_accessor.h" #include "op-attrs/ff_dim.h" #include "op-attrs/get_output_shapes.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/graph/views.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Linear; enum slots { @@ -43,12 +25,12 @@ enum slots { OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(HANDLE, ff_handle()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(ATTRS, attrs); - bind.bind(INPUT, input_tensor(0)); // input - bind.bind(WEIGHT, 
weight_tensor(0)); // weight - bind.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); // input + binding.bind(WEIGHT, weight_tensor(0)); // weight + binding.bind(OUTPUT, output_tensor(0)); // output return {LINEAR_INIT_TASK_ID, binding}; } @@ -56,14 +38,17 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - bind.bind(INPUT, input_tensor(0)); // input - bind.bind(WEIGHT, weight_tensor(0)); // weight - bind.bind(OUTPUT, output_tensor(0)); // output - bind.bind(BIAS, bias_tensor(0)); // bias + binding.bind(INPUT, input_tensor(0)); // input + binding.bind(WEIGHT, weight_tensor(0)); // weight + binding.bind(OUTPUT, output_tensor(0)); // output + if (attrs.use_bias) { + binding.bind(BIAS, weight_tensor(1)); // bias + } - bing.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(PER_DEVICE_STATE, per_device_state()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + binding.bind_arg(ATTRS, attrs); return {LINEAR_FWD_TASK_ID, binding}; } @@ -74,51 +59,38 @@ OpTaskInvocation backward(LinearAttrs const &attrs) { return {LINEAR_BWD_TASK_ID, b}; } -static DeviceSpecific - init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - Allocator allocator = acc.get_allocator(); +static LinearPerDeviceState init_task_impl(TaskArgumentAccessor const &acc) { + auto const &attrs = acc.get_argument(ATTRS); PerDeviceFFHandle handle = acc.get_argument(HANDLE); auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); int out_dim = output.shape.at(ff_dim_t{0}); - int batch_size = output.shape.at.(ff_dim_t{1}); + int batch_size = output.shape.at(ff_dim_t{1}); float *one_ptr; - DeviceSpecific state = - acc.create_device_specific( - init_kernel(handle, - allocator, - one_ptr, - 
attrs.regularizer, - attrs.use_bias, - input.data_type, - weight.data_type, - output.data_type, - batch_size, - attrs.out_channels)); + LinearPerDeviceState state = init_kernel(handle, + one_ptr, + attrs.regularizer, + attrs.use_bias, + input.data_type, + weight.data_type, + output.data_type, + batch_size, + attrs.out_channels); return state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); auto bias = acc.get_tensor(BIAS); - auto state = acc.get_device_specific(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); @@ -133,7 +105,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Linear] forward_time = %.2lfms\n", + "[Linear] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -144,15 +116,10 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { batch_size); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -}; +; -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); @@ -161,7 +128,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { 
auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); auto output_grad = acc.get_tensor_grad(OUTPUT); - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); @@ -176,65 +144,63 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Linear] backward_time = %.2lfms\n", + "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), - output.get_float_ptr(), - output_grad.get_float_ptr(), - weight.get_float_ptr(), - weight_grad.get_float_ptr(), - bias_ptr, + (void *)input.get_float_ptr(), + (void *)input_grad.get_float_ptr(), + (void *)output.get_float_ptr(), + (void *)output_grad.get_float_ptr(), + (void *)weight.get_float_ptr(), + (void *)weight_grad.get_float_ptr(), + (void *)bias_ptr, in_dim, out_dim, batch_size); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, LinearAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shape(input.shape, attrs); + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + ParallelTensorShape weight_shape = get_weights_shape(attrs, input.shape); + ParallelTensorShape bias_shape = get_bias_shape(attrs, input.shape); SimTaskBinding init_binding; - init_binding.bind(INPUT, input_tensor(0)); - init_binding.bind(WEIGHT, weight_tensor(0)); - init_binding.bind(BIAS, bias_tensor(0)); - 
init_binding.bind(OUTPUT, output_tensor(0)); + init_binding.bind(INPUT, input.shape); + init_binding.bind(WEIGHT, weight_shape); + if (attrs.use_bias) { + init_binding.bind(BIAS, bias_shape); + } + init_binding.bind(OUTPUT, output_shape); init_binding.bind_arg(ATTRS, attrs); init_binding.bind_arg(HANDLE, ff_handle()); auto init_accessor = env.get_init_accessor(LINEAR_INIT_TASK_ID, init_binding); - DeviceSpecific per_device_state = - init_task_impl(init_accessor); + LinearPerDeviceState per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; - fwd_bind.bind(INPUT, input_tensor(0)); // input - fwd_bind.bind(WEIGHT, weight_tensor(0)); // weight - fwd_bind.bind(OUTPUT, output_tensor(0)); // output - fwd_bind.bind(BIAS, bias_tensor(0)); // bias + fwd_binding.bind(INPUT, input.shape); // input + fwd_binding.bind(WEIGHT, weight_shape); // weight + fwd_binding.bind(OUTPUT, output_shape); // output + if (attrs.use_bias) { + fwd_binding.bind(BIAS, bias_shape); // bias + } - fwd_bid.bind_arg(PROFILING, profiling_settings()); - fwd_bind.bind_arg(PER_DEVICE_STATE, per_device_state()); - fwd_bind.bind_arg(ATTRS, attrs); + fwd_binding.bind_arg(PROFILING, profiling_settings()); + fwd_binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + fwd_binding.bind_arg(ATTRS, attrs); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_accessor(LINEAR_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = env.get_accessor(LINEAR_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = env.get_fwd_accessor(LINEAR_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(LINEAR_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -248,15 +214,14 @@ OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); - init.add_input_slot(WEIGHT); - init.add_input_slot(BIAS); + 
init.add_weight_slot(WEIGHT); init.add_output_slot(OUTPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); - return init, + return init; } template <> @@ -264,8 +229,8 @@ OpTaskSignature fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); - fwd.add_input_slot(WEIGHT); - fwd.add_input_slot(BIAS); + fwd.add_weight_slot(WEIGHT); + fwd.add_optional_weight_slot(BIAS); fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); @@ -281,13 +246,28 @@ OpTaskSignature bwd_signature() { return bwd; } +template <> +TaskImplFunction get_task_impl() { + return init_task_impl; +} + +template <> +TaskImplFunction get_task_impl() { + return forward_task_impl; +} + +template <> +TaskImplFunction get_task_impl() { + return backward_task_impl; +} + template <> void register_task() { register_task(LINEAR_INIT_TASK_ID, "Linear::init_task", init_signature(), - init_task); + init_task_impl); } template <> @@ -295,7 +275,7 @@ void register_task() { register_task(LINEAR_FWD_TASK_ID, "Linear::fwd_task", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -303,7 +283,11 @@ void register_task() { register_task(LINEAR_BWD_TASK_ID, "Linear::bwd_task", bwd_signature(), - backward_task); + backward_task_impl); +} + +std::vector get_task_ids(LinearAttrs const &) { + return {LINEAR_INIT_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID}; } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/linear.h b/lib/local-execution/src/ops/linear.h similarity index 98% rename from lib/runtime/src/ops/linear.h rename to lib/local-execution/src/ops/linear.h index 2b476382ef..2ff9016114 100644 --- a/lib/runtime/src/ops/linear.h +++ b/lib/local-execution/src/ops/linear.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/linear.h" -#include "op_task_invocation.h" -#include 
"sim_environment.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/noop.cc b/lib/local-execution/src/ops/noop.cc similarity index 95% rename from lib/runtime/src/ops/noop.cc rename to lib/local-execution/src/ops/noop.cc index 6b8510607a..168d547c17 100644 --- a/lib/runtime/src/ops/noop.cc +++ b/lib/local-execution/src/ops/noop.cc @@ -14,7 +14,7 @@ */ #include "noop.h" -#include "task_spec/op_task_invocation.h" +#include "local-execution/op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/noop.h b/lib/local-execution/src/ops/noop.h similarity index 87% rename from lib/runtime/src/ops/noop.h rename to lib/local-execution/src/ops/noop.h index f5cf6cc98c..fab2cf1f86 100644 --- a/lib/runtime/src/ops/noop.h +++ b/lib/local-execution/src/ops/noop.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_NOOP_H #define _FLEXFLOW_NOOP_H +#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/input.h" #include "op-attrs/ops/noop.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/parallel_op.h b/lib/local-execution/src/ops/parallel_op.h similarity index 96% rename from lib/runtime/src/ops/parallel_op.h rename to lib/local-execution/src/ops/parallel_op.h index 6b596a4fb5..e7bd98b8a8 100644 --- a/lib/runtime/src/ops/parallel_op.h +++ b/lib/local-execution/src/ops/parallel_op.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ParallelOpJoinResult { - optional op = nullopt; + std::optional op = std::nullopt; bool join_did_succeed = false; }; diff --git a/lib/runtime/src/ops/partition.cc b/lib/local-execution/src/ops/partition.cc similarity index 61% rename from lib/runtime/src/ops/partition.cc rename to lib/local-execution/src/ops/partition.cc index 2a974e96da..4b09ad026b 100644 --- a/lib/runtime/src/ops/partition.cc +++ b/lib/local-execution/src/ops/partition.cc @@ -13,32 +13,13 @@ * limitations under the License. 
*/ -#include "parallel_ops/partition.h" #include "kernels/partition_kernels.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" +#include "repartition.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Repartition; @@ -59,7 +40,7 @@ OpTaskInvocation forward(RepartitionAttrs const &attrs) { binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(ATTRS, attrs); binding.bind_arg(PER_DEVICE_STATE, - per_device_state()); + per_device_op_state()); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -79,64 +60,39 @@ static DeviceSpecific // Note: use the input data type DeviceSpecific per_device_state = - acc.create_device_specific_state( - init_kernel(handle, input.data_type)); + init_kernel(handle, input.data_type); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - return profiling(forward, 
- profiling, - "[Reparition/Partition] forward_time = %.2lfms\n", - per_device_state, - input, - output); -} - -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); + return profile(forward_kernel, + profiling, + "[Reparition/Partition] forward_time = {:.2lf}ms\n", + per_device_state, + input, + output); } -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward, - profiling, - "[Reparition/Partition] backward_time = %.2lfms\n", - per_device_state, - input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + return profile(backward_kernel, + profiling, + "[Reparition/Partition] backward_time = {:.2lf}ms\n", + per_device_state, + output_grad, + input_grad); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -144,7 +100,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); @@ -165,8 +121,10 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_accessor(REPARTITION_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = 
env.get_accessor(REPARTITION_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = + env.get_fwd_accessor(REPARTITION_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = + env.get_bwd_accessor(REPARTITION_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -185,7 +143,8 @@ void register_task() { init.add_return_value(); - register_task(REPARTITION_INIT_TASK_ID, "Repartition Init", init, init_task); + register_task( + REPARTITION_INIT_TASK_ID, "Repartition Init", init, init_task_impl); } template <> @@ -197,15 +156,19 @@ void register_task() { fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - register_task(REPARTITION_FWD_TASK_ID, "Repartition Fwd", fwd, forward_task); + register_task( + REPARTITION_FWD_TASK_ID, "Repartition Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REPARTITION_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REPARTITION_BWD_TASK_ID, "Repartition Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REPARTITION_FWD_TASK_ID)); + +// register_task(REPARTITION_BWD_TASK_ID, "Repartition Bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc similarity index 57% rename from lib/runtime/src/ops/pool_2d.cc rename to lib/local-execution/src/ops/pool_2d.cc index 577837c960..989f390380 100644 --- a/lib/runtime/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -1,10 +1,10 @@ #include "pool_2d.h" #include "kernels/pool_2d_kernels.h" -#include "legion/legion_utilities.h" + #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/pool_2d.h" #include "utils/exception.decl.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include 
"utils/hash-utils.h" using namespace FlexFlow::Kernels::Pool2D; @@ -23,13 +23,13 @@ OpTaskInvocation init(Pool2DAttrs const &attrs) { return {POOL2D_INIT_TASK_ID, binding}; } -static DeviceSpecific +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); int input_w = input.shape.at(ff_dim_t(0)) + 1; int input_h = input.shape.at(ff_dim_t(1)) + 1; @@ -64,37 +64,27 @@ static DeviceSpecific printf("Warning: changing pool_padding_w to satisfy output_w size\n"); } - DeviceSpecific state = acc.create_device_specific( - init_kernel(handle, - attrs.activation, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - attrs.kernel_h, - attrs.kernel_w, - attrs.stride_h, - attrs.stride_w, - attrs.pool_type); + DeviceSpecific state = init_kernel(handle, + attrs.activation, + input_w, + input_h, + input_c, + input_n, + output_w, + output_h, + output_c, + output_n, + pad_h, + pad_w, + attrs.kernel_h, + attrs.kernel_w, + attrs.stride_h, + attrs.stride_w, + attrs.pool_type); return state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - OpTaskInvocation forward(Pool2DAttrs const &attrs) { OpTaskBinding binding; binding.bind(INPUT, input_tensor(0)); @@ -102,54 +92,47 @@ OpTaskInvocation forward(Pool2DAttrs const &attrs) { binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state()); return {POOL2D_FWD_TASK_ID, binding}; } -OpTaskInvocation backward(Pool2DAttrs const &) { +OpTaskInvocation backward(Pool2DAttrs const &attrs) { 
OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); return {POOL2D_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - Pool2dPerDeviceState state = - acc.get_argument(PER_DEVICE_STATE); + Pool2DPerDeviceState state = + acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, - profilng, - "[Pool2D] forward_time = %.2lfms\n", + profiling, + "[Pool2D] forward_time = {:.2lf}ms\n", state, input.get_float_ptr(), output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - Pool2dPerDeviceState state = - acc.get_argument(PER_DEVICE_STATE); + Pool2DPerDeviceState state = + acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto output_grad = acc.get_tensor(OUTPUT); return profile(backward_kernel, - profilng, - "[Pool2D] backward_time = %.2lfms\n", + profiling, + "[Pool2D] backward_time = {:.2lf}ms\n", state, input.get_float_ptr(), input_grad.get_float_ptr(), @@ -157,20 +140,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void 
backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, Pool2DAttrs const &attrs, - ParallelTensorShape const &input, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; @@ -181,21 +156,21 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, auto init_accessor = env.get_init_accessor(POOL2D_INIT_TASK_ID, init_binding); - DeviceSpecific per_device_state = + DeviceSpecific per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; - fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); - auto fwd_accessor = env.get_accessor(POOL2D_FWD_TASK_ID, fwd_binding); + auto fwd_accessor = env.get_fwd_accessor(POOL2D_FWD_TASK_ID, fwd_binding); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto bwd_accessor = env.get_accessor(POOL2D_BWD_TASK_ID, bwd_binding); + auto bwd_accessor = env.get_bwd_accessor(POOL2D_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -217,7 +192,7 @@ void register_task() { init.add_return_value(); - register_task(POOL2D_INIT_TASK_ID, "Pool2D::init", init, init_taks); + register_task(POOL2D_INIT_TASK_ID, "Pool2D::init", init, init_task_impl); } template <> @@ -228,17 +203,20 @@ void register_task() { fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(PER_DEVICE_STATE); + 
fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - register_task(POOL2D_FWD_TASK_ID, "Pool2D::forward", fwd, forward_task); + register_task(POOL2D_FWD_TASK_ID, "Pool2D::forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(POOL2D_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(POOL2D_BWD_TASK_ID, "Pool2D::backward", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(POOL2D_FWD_TASK_ID)); + +// register_task(POOL2D_BWD_TASK_ID, "Pool2D::backward", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/pool_2d.h b/lib/local-execution/src/ops/pool_2d.h similarity index 95% rename from lib/runtime/src/ops/pool_2d.h rename to lib/local-execution/src/ops/pool_2d.h index f8701f461e..0537e9f1c4 100644 --- a/lib/runtime/src/ops/pool_2d.h +++ b/lib/local-execution/src/ops/pool_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/pool_2d.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(Pool2DAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, Pool2DAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input_shape, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc similarity index 57% rename from lib/runtime/src/ops/reduce.cc rename to lib/local-execution/src/ops/reduce.cc index 2674dc4fef..98d1a6f522 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -1,27 +1,12 @@ #include "reduce.h" #include "kernels/reduce_kernels.h" -#include 
"legion/legion_utilities.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" + +#include "op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" #include "utils/type_traits_core.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reduce; @@ -35,7 +20,7 @@ enum Slots { HANDLE }; -OpTaskInvocation init(TransposeAttrs const &attrs) { +OpTaskInvocation init(ReduceAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(HANDLE, ff_handle()); @@ -54,42 +39,33 @@ static DeviceSpecific auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - OperatorType = attrs.op_type; + OperatorType op_type = attrs.op_type; // Note: How to set the reduction size? 
size_t reduction_size = input.shape.get_volume() / output.shape.get_volume(); DeviceSpecific per_device_state = - acc.create_device_specific(init_kernel( - handle, op_type, reduction_size, input.shape, output.shape)); + init_kernel(handle, op_type, reduction_size, input.shape, output.shape); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT) + OpTaskSignature init(OpTaskType::INIT); - init.add_unchecked_arg_slot(HANDLE); + init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); init.add_return_value(); - register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); + register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task_impl); } // Note: forward_kernel only needs ReducePerDeviceState, input, output OpTaskInvocation forward(ReduceAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - bind.bind_arg(PROFILING, profiling_tensor()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + binding.bind_arg(PROFILING, profiling_settings()); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -97,7 +73,7 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) { return {REDUCE_FWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -107,31 +83,23 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Reduce] forward_time = %.2lfms\n", + "[Reduce] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), 
output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FORWARD); + OpTaskSignature fwd(OpTaskType::FWD); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task); + register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task_impl); } OpTaskInvocation backward(ReduceAttrs const &attrs) { @@ -140,48 +108,44 @@ OpTaskInvocation backward(ReduceAttrs const &attrs) { return {REDUCE_BWD_TASK_ID, binding}; } -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, - "[Reduce] backward_time = %.2lfms\n", + "[Reduce] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} +// TODO: OpTaskSignature -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); +// template <> +// void register_task() { +// OpTaskSignature bwd = +// 
infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); - reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task); -} +// register_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, +// backward_task_impl); +// } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReduceAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); + init_binding.bind_arg(HANDLE, ff_handle()); auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding); DeviceSpecific per_device_state = @@ -189,10 +153,10 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding fwd_binding; ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - fwd.bind(INPUT, input.shape); - fwd.bind(OUTPUT, output_shape); - fwd.bind_arg(PROFILING, settings); - fwd.bind_arg(PER_DEVICE_STATE, per_device_state); + fwd_binding.bind(INPUT, input.shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); diff --git a/lib/runtime/src/ops/reduce.h b/lib/local-execution/src/ops/reduce.h similarity index 96% rename from lib/runtime/src/ops/reduce.h rename to lib/local-execution/src/ops/reduce.h index 099083ed67..6d47ec2f4d 100644 --- a/lib/runtime/src/ops/reduce.h +++ b/lib/local-execution/src/ops/reduce.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reduce.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow 
{ diff --git a/lib/runtime/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc similarity index 59% rename from lib/runtime/src/ops/reduction.cc rename to lib/local-execution/src/ops/reduction.cc index 9a11d3a6f5..3fa300f64d 100644 --- a/lib/runtime/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -13,32 +13,14 @@ * limitations under the License. */ -#include "parallel_ops/reduction.h" +#include "reduction.h" #include "kernels/reduction_kernels.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reduction; @@ -61,7 +43,7 @@ OpTaskInvocation backward(ReductionAttrs const &attrs) { return {REDUCTION_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling_settings = acc.get_argument(PROFILING); @@ -71,40 +53,25 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { size_t num_replicas = attrs.reduction_degree; - return profiling(forward_kernel, - profiling_settings, - "[Reduction] forward_time = %.2lfms\n", - input, - output, - num_replicas); + return profile(forward_kernel, + profiling_settings, + "[Reduction] forward_time = {:.2lf}ms\n", + input, + output, + num_replicas); } -static void forward_task(Task 
const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward_kernel, - profiling, - "[Reduction] backward_time = %.2lfms\n", - input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + return profile(backward_kernel, + profiling, + "[Reduction] backward_time = {:.2lf}ms\n", + input_grad, + output_grad); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -114,13 +81,13 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, MachineView const &machine_view) { ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(ATTRS, attrs); fwd_binding.bind(INPUT, input.shape); - fwd.binding.bind(OUTPUT, output_shape); + fwd_binding.bind(OUTPUT, output_shape); auto fwd_accessor = env.get_fwd_accessor(REDUCTION_FWD_TASK_ID, fwd_binding); @@ -145,15 +112,18 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REDUCTION_FWD_TASK_ID, "Reduction Fwd", fwd, forward_task); + register_task(REDUCTION_FWD_TASK_ID, "Reduction Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - 
infer_bwd_signature(get_op_signature(REDUCTION_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REDUCTION_BWD_TASK_ID, "Reduction Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REDUCTION_FWD_TASK_ID)); + +// register_task(REDUCTION_BWD_TASK_ID, "Reduction Bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reduction.h b/lib/local-execution/src/ops/reduction.h similarity index 94% rename from lib/runtime/src/ops/reduction.h rename to lib/local-execution/src/ops/reduction.h index 978ca6b080..a69b75f310 100644 --- a/lib/runtime/src/ops/reduction.h +++ b/lib/local-execution/src/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "op-attrs/ops/combine.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" +#include "op-attrs/ops/reduction.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/repartition.h b/lib/local-execution/src/ops/repartition.h similarity index 97% rename from lib/runtime/src/ops/repartition.h rename to lib/local-execution/src/ops/repartition.h index fccc0de7be..a73bd3f808 100644 --- a/lib/runtime/src/ops/repartition.h +++ b/lib/local-execution/src/ops/repartition.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/repartition.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc similarity index 66% rename from lib/runtime/src/ops/replicate.cc rename to lib/local-execution/src/ops/replicate.cc index 1675a62c5f..a441985b78 100644 --- 
a/lib/runtime/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -13,39 +13,21 @@ * limitations under the License. */ -#include "parallel_ops/replicate.h" +#include "replicate.h" #include "kernels/replicate_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/graph/serialparallel.h" #include "utils/hash-utils.h" #include namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Replicate; -enum Slots { INPUT, OUTPUT, PROFILING }; +enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; OpTaskInvocation forward(ReplicateAttrs const &attrs) { OpTaskBinding binding; @@ -54,6 +36,7 @@ OpTaskInvocation forward(ReplicateAttrs const &attrs) { binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); return {REPLICATE_FWD_TASK_ID, binding}; } @@ -63,7 +46,7 @@ OpTaskInvocation backward(ReplicateAttrs const &attrs) { return {REPLICATE_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -71,38 +54,25 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[replicate] forward_time = %.2lfms\n", + "[replicate] forward_time = 
{:.2lf}ms\n", input, output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto const &attrs = acc.get_argument(ATTRS); return profile(backward_kernel, profiling, - "[replicate] backward_time = %.2lfms\n", + "[replicate] backward_time = {:.2lf}ms\n", input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + output_grad, + attrs.replicate_degree); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -110,7 +80,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PROFILING, settings); ParallelTensorShape output = get_output_shape(attrs, input.shape); @@ -136,14 +106,18 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REPLICATE_FWD_TASK_ID, "Replicate fwd", fwd, forward_task); + register_task(REPLICATE_FWD_TASK_ID, "Replicate fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(CAST_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REPLICATE_BWD_TASK_ID, "Replicate bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// 
OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REPLICATE_FWD_TASK_ID)); + +// register_task(REPLICATE_BWD_TASK_ID, "Replicate bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/replicate.h b/lib/local-execution/src/ops/replicate.h similarity index 89% rename from lib/runtime/src/ops/replicate.h rename to lib/local-execution/src/ops/replicate.h index da2b71f098..339f805f2c 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/local-execution/src/ops/replicate.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/replicate.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc similarity index 68% rename from lib/runtime/src/ops/reshape.cc rename to lib/local-execution/src/ops/reshape.cc index c9dc8cff8d..efee73645b 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -15,24 +15,10 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reshape; @@ -69,74 +55,50 @@ static DeviceSpecific auto attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(attrs.shape.data_type)); + init_kernel(attrs.shape.data_type); return 
per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - Profiling profiling = acc.get_argument(PROFILING); + ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, profiling, - "[Reshape] forward time = %.2lfms\n", + "[Reshape] forward time = {:.2lf}ms\n", per_device_state, input, output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - Profiling profiling = acc.get_argument(PROFILING); + ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, - "[Reshape] backward time = %.2lfms\n", + "[Reshape] backward time = {:.2lf}ms\n", per_device_state, input_grad, output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReshapeAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { + auto env = 
sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); auto init_accessor = @@ -168,9 +130,9 @@ void register_task() { init.add_arg_slot(ATTRS); - init.add_return_value(PER_DEVICE_STATE); + init.add_return_value(); - register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); + register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task_impl); } template <> @@ -183,15 +145,17 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task); + register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); + +// register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reshape.h b/lib/local-execution/src/ops/reshape.h similarity index 97% rename from lib/runtime/src/ops/reshape.h rename to lib/local-execution/src/ops/reshape.h index f044e3f057..14b22561a0 100644 --- a/lib/runtime/src/ops/reshape.h +++ b/lib/local-execution/src/ops/reshape.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reshape.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc similarity index 67% rename from lib/runtime/src/ops/reverse.cc rename to lib/local-execution/src/ops/reverse.cc index ac64146cd1..7fefb3d357 100644 --- 
a/lib/runtime/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -19,23 +19,9 @@ #include "op-attrs/get_output_shapes.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reverse; +using coord_t = long long; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -43,7 +29,7 @@ OpTaskInvocation forward(ReverseAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -56,28 +42,28 @@ OpTaskInvocation backward(ReverseAttrs const &attrs) { return {REVERSE_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int output_size = outtput.shape.get_volume(); + int output_size = output.shape.get_volume(); auto axis = attrs.axis; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < output.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= output.shape[i]; + in_blk_size *= output.shape.at(ff_dim_t(i)); } else if (i == axis) { - reverse_dim_size = output.shape[i]; + reverse_dim_size = output.shape.at(ff_dim_t(i)); } else { - num_out_blks *= output.shape[i]; + num_out_blks *= output.shape.at(ff_dim_t(i)); } } return profile(forward_kernel, profiling, - "[reverse] 
forward_time = %.2lfms\n", + "[reverse] forward_time = {:.2lf}ms\n", input.get_float_ptr(), output.get_float_ptr(), num_out_blks, @@ -86,49 +72,34 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output_size); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input.shape.get_dim() - attrs.axis - 1; + int axis = input_grad.shape.get_dim() - attrs.axis.value() - 1; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < input_grad.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= input_grad.shape[i]; + in_blk_size *= input_grad.shape.at(ff_dim_t(i)); } else if (i == axis) { - reverse_dim_size = input_grad.shape[i]; + reverse_dim_size = input_grad.shape.at(ff_dim_t(i)); } else { - num_out_blks *= input_grad.shape[i]; + num_out_blks *= input_grad.shape.at(ff_dim_t(i)); } } return profile(backward_kernel, profiling, - "[reverse] backward_time = %.2lfms\n", + "[reverse] backward_time = {:.2lf}ms\n", output_grad.get_float_ptr(), input_grad.get_float_ptr(), num_out_blks, reverse_dim_size, in_blk_size, - input.shape.get_volume()); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + input_grad.shape.get_volume()); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -136,7 +107,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, 
InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; @@ -161,21 +132,23 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, } template <> -void register_task()) { +void register_task() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REVERSE_FWD_TASK_ID, "Reverse forward", fwd, forward_task); + register_task(REVERSE_FWD_TASK_ID, "Reverse forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REVERSE_BWD_TASK_ID)); - register_task(REVERSE_BWD_TASK_ID, "Reverse backward", bwd, backward_task); -} +// TODO: OpTaskSignature +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REVERSE_BWD_TASK_ID)); +// register_task(REVERSE_BWD_TASK_ID, "Reverse backward", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reverse.h b/lib/local-execution/src/ops/reverse.h similarity index 89% rename from lib/runtime/src/ops/reverse.h rename to lib/local-execution/src/ops/reverse.h index af4d335429..5be501698c 100644 --- a/lib/runtime/src/ops/reverse.h +++ b/lib/local-execution/src/ops/reverse.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reverse.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc similarity index 68% rename from lib/runtime/src/ops/softmax.cc rename to lib/local-execution/src/ops/softmax.cc index b67f9730a4..ea857c680b 100644 --- 
a/lib/runtime/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -17,26 +17,10 @@ #include "kernels/softmax_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Softmax; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE }; @@ -75,21 +59,11 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, attrs.dim)); + init_kernel(handle, attrs.dim.value()); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -98,21 +72,14 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[SoftMax] forward_time = %.2lfms\n", + "[SoftMax] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), - output.get_float_ptr(), ); + output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime 
*runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); @@ -124,22 +91,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { assert(output_grad.shape == output.shape); - return profile( - backward_kernel, - profiling, - "[SoftMax] backward_time = %.2lfms\n", - input_grad.get_float_ptr(), - output_grad.get_float_ptr(), - output_grad.shape.volume(), // Note(lambda): get num_elements, maybe wrong - ); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + return profile(backward_kernel, + profiling, + "[SoftMax] backward_time = {:.2lf}ms\n", + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.shape.get_volume()); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -147,7 +104,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); @@ -162,7 +119,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, init_task_impl(init_accessor); SimTaskBinding fwd_binding; - ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); fwd_binding.bind_arg(PROFILING, settings); @@ -186,9 +142,9 @@ void register_task() { init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); - 
init.add_return_value_slot(); + init.add_return_value(); - register_task(SOFTMAX_INIT_TASK_ID, "SoftMax Init", init, init_task); + register_task(SOFTMAX_INIT_TASK_ID, "SoftMax Init", init, init_task_impl); } template <> @@ -201,15 +157,17 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(SOFTMAX_FWD_TASK_ID, "SoftMax Fwd", fwd, forward_task); + register_task(SOFTMAX_FWD_TASK_ID, "SoftMax Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(SOFTMAX_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(SOFTMAX_BWD_TASK_ID, "SoftMax Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(SOFTMAX_FWD_TASK_ID)); + +// register_task(SOFTMAX_BWD_TASK_ID, "SoftMax Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/softmax.h b/lib/local-execution/src/ops/softmax.h similarity index 97% rename from lib/runtime/src/ops/softmax.h rename to lib/local-execution/src/ops/softmax.h index 06b9d09d60..a83d8f4116 100644 --- a/lib/runtime/src/ops/softmax.h +++ b/lib/local-execution/src/ops/softmax.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SOFTMAX_H #define _FLEXFLOW_SOFTMAX_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/softmax.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/split.cc b/lib/local-execution/src/ops/split.cc similarity index 58% rename from lib/runtime/src/ops/split.cc rename to lib/local-execution/src/ops/split.cc index 2af5d42874..13e95d37f9 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -16,28 +16,14 @@ #include "split.h" #include "kernels/array_shape.h" #include "kernels/split_kernels.h" -#include "utils/exceptions.h" +#include 
"op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; using namespace FlexFlow::Kernels::Split; +using coord_t = long long; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -58,96 +44,86 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { return {SPLIT_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +void calc_block_size(coord_t &num_blocks, + coord_t &block_size, + ArrayShape const &array_shape, + int axis) { + num_blocks = 1; + block_size = 1; + for (int d = 0; d < array_shape.num_elements(); d++) { + if (d <= axis) { + block_size *= array_shape.at(legion_dim_t(d)); + } else { + num_blocks *= array_shape.at(legion_dim_t(d)); + } + } +} + +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis); + coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; + calc_block_size(num_blocks, in_block_size, input.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blks; + coord_t out_num_blocks; calc_block_size( - out_num_blks, out_blk_size[i], output.shape, split->legion_axis); + out_num_blocks, out_block_size[i], output.shape, attrs.axis.value()); } + float *output_float_ptr = 
output.get_float_ptr(); return profile(forward_kernel, profiling, - "Split forward_time = %.2lfms\n", - &output.get_float_ptr(), + "Split forward_time = {:.2lf}ms\n", + &output_float_ptr, input.get_float_ptr(), - out_blk_size, - in_blk_size, - num_blks, + out_block_size, + in_block_size, + num_blocks, attrs.splits.size()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - // maybe we should add assert like the original code -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input_grade.shape, attrs.axis); + coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; + calc_block_size( + num_blocks, in_block_size, input_grad.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blks; - calc_block_size( - out_num_blks, out_blk_size[i], output_grad.shape, split->legion_axis); + coord_t out_num_blocks; + calc_block_size(out_num_blocks, + out_block_size[i], + output_grad.shape, + attrs.axis.value()); } + float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, profiling, - "Split backward_time = %.2lfms\n", + "Split backward_time = {:.2lf}ms\n", input_grad.get_float_ptr(), - &output_grad.get_float_ptr(), - out_blk_size, - in_blk_size, - num_blks, + &output_grad_ptr, + out_block_size, + in_block_size, + num_blocks, attrs.splits.size()); } -void calc_block_size(coord_t &num_blks, - coord_t &blk_size, - ArrayShape const &array_shape, - int axis) { - num_blks = 
1; - blk_size = 1; - for (int d = 0; d < array_shape.get_dim(); d++) { - if (d <= axis) { - blk_size *= (domain.hi()[d] - domain.lo()[d] + 1); - blk_size *= array_shape.at(legion_dim_t(d)) + 1 - } else { - num_blks *= array_shape.at(legion_dim_t(d)) + 1 - } - } -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + std::vector output_shape = + get_output_shapes(attrs, input.shape); SimTaskBinding fwd_binding; fwd_binding.bind(INPUT, input.shape); @@ -166,6 +142,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, return make_metrics(forward_time, backward_time, sync_time, env); } +// TODO: OpTaskSignature + template <> void register_task() { OpTaskSignature fwd(OpTaskType::FWD); @@ -174,15 +152,15 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task); + register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); - register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task); -} +// register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/split.h b/lib/local-execution/src/ops/split.h similarity index 93% rename 
from lib/runtime/src/ops/split.h rename to lib/local-execution/src/ops/split.h index d63212e836..f51e0ea6af 100644 --- a/lib/runtime/src/ops/split.h +++ b/lib/local-execution/src/ops/split.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/split.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(SplitAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, - InputParallelTensorDes const &input, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc similarity index 60% rename from lib/runtime/src/ops/topk.cc rename to lib/local-execution/src/ops/topk.cc index 958516a6d9..8aceb9c6d4 100644 --- a/lib/runtime/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -16,28 +16,9 @@ #include "topk.h" #include "kernels/topk_kernels.h" #include "op-attrs/get_output_shapes.h" -#include "utils/exceptions.h" +#include "utils/exception.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; using namespace FlexFlow::Kernels::TopK; @@ -50,7 +31,7 @@ enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, PER_DEVICE_STATE }; OpTaskInvocation init(TopKAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(ATTRS, 
attrs); + binding.bind_arg(ATTRS, attrs); return {TOPK_INIT_TASK_ID, binding}; } @@ -60,7 +41,7 @@ OpTaskInvocation forward(TopKAttrs const &attrs) { binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -81,23 +62,14 @@ static DeviceSpecific auto attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific(init_kernel(attrs.sorted)); + init_kernel(attrs.sorted); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto per_device_state = - acc.get_device_specific(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); auto profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -107,31 +79,24 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { size_t batch_size = input.shape.get_volume() / length; auto indices = acc.get_tensor(INDICES); - return profiling(forward_kernel, - profiling, - "[TopK] forward_time = %.2lfms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - indices.get_int32_ptr(), - batch_size, - length, - attrs.k, - attrs.sorted); + return profile(forward_kernel, + profiling, + "[TopK] forward_time = {:.2lf}ms\n", + per_device_state, + input.get_float_ptr(), + output.get_float_ptr(), + indices.get_int32_ptr(), + batch_size, + length, + attrs.k, + attrs.sorted); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor 
acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto per_device_state = - acc.get_device_specific(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); auto profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); @@ -139,27 +104,19 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto indices = acc.get_tensor(INDICES); - int length = input.shape.at(legion_dim_t(0)) + 1; - size_t batch_size = input.shape.get_volume() / length; - - return profiling(backward_kernel, - profiling, - "[TopK] backward_time = %.2lfms\n", - per_device_state, - output_grad.get_float_ptr(), - indices.get_int32_ptr(), - input_grad.get_float_ptr(), - batch_size, - length, - attrs.k); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + int length = input_grad.shape.at(legion_dim_t(0)) + 1; + size_t batch_size = input_grad.shape.get_volume() / length; + + return profile(backward_kernel, + profiling, + "[TopK] backward_time = {:.2lf}ms\n", + per_device_state, + output_grad.get_float_ptr(), + indices.get_int32_ptr(), + input_grad.get_float_ptr(), + batch_size, + length, + attrs.k); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -167,9 +124,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shapes(attrs, input.shape); + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding 
init_binding; init_binding.bind_arg(ATTRS, attrs); @@ -204,7 +161,7 @@ void register_task() { init.add_arg_slot(ATTRS); // Note: this may have some question init.add_return_value(); - register_task(TOPK_INIT_TASK_ID, "Topk Init", init, init_task); + register_task(TOPK_INIT_TASK_ID, "Topk Init", init, init_task_impl); } template <> @@ -219,14 +176,17 @@ void register_task() { fwd.add_output_slot(OUTPUT); fwd.add_output_slot(INDICES); - register_task(TOPK_FWD_TASK_ID, "TopK Forward", fwd, forward_task); + register_task(TOPK_FWD_TASK_ID, "TopK Forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(TOPK_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(TOPK_BWD_TASK_ID, "TopK Backward", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(TOPK_FWD_TASK_ID)); + +// register_task(TOPK_BWD_TASK_ID, "TopK Backward", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/topk.h b/lib/local-execution/src/ops/topk.h similarity index 97% rename from lib/runtime/src/ops/topk.h rename to lib/local-execution/src/ops/topk.h index f15ff6de81..db85fd9d03 100644 --- a/lib/runtime/src/ops/topk.h +++ b/lib/local-execution/src/ops/topk.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/topk.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc similarity index 55% rename from lib/runtime/src/ops/transpose.cc rename to lib/local-execution/src/ops/transpose.cc index ea6182772f..c998484455 100644 --- a/lib/runtime/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -15,27 +15,10 @@ #include "transpose.h" 
#include "kernels/transpose_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/transpose.h" #include "utils/exception.decl.h" -namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Transpose; namespace FlexFlow { @@ -57,33 +40,26 @@ OpTaskInvocation init(TransposeAttrs const &attrs) { static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); - std::vector perm = attrs.perm; // default convert stack_vector to vector + std::vector perm = static_cast>(attrs.perm); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(perm.size(), perm)); + init_kernel(perm.size(), perm); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} +// TODO: OpTaskSignature -template <> -void register_task(); -OpTaskSignature init(OpTaskType::INIT) +// template <> +// void register_task() { +// OpTaskSignature init(OpTaskType::INIT); - init.add_arg_slot(ATTRS); +// init.add_arg_slot(ATTRS); -init.add_return_value(); +// init.add_return_value(); -register_task(TRANSPOSE_INIT_TASK_ID, "Transpose::init", init, init_task); -} // namespace FlexFlow +// register_task(TRANSPOSE_INIT_TASK_ID, "Transpose::init", init, +// init_task_impl); +// } OpTaskInvocation forward(TransposeAttrs const &attrs) { OpTaskBinding binding; @@ -92,13 +68,13 @@ OpTaskInvocation forward(TransposeAttrs const 
&attrs) { per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - bind.bind(INPUT, input_tensor(0)); - bind.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {TRANSPOSE_FWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -106,47 +82,32 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - return profiling(forward_kernel, - profiling, - "[Transpose] Forward_time = %.2lf [ms]", - per_device_state, - input, - output); + return profile(forward_kernel, + profiling, + "[Transpose] Forward_time = {:.2lf} [ms]", + per_device_state, + input, + output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = - acc.get_per_device_state(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward_kernel, - profiling, - "[Transpose] Backward_time = %.2lf [ms]", - per_device_state, - input_grad, - output_grad); + return profile(backward_kernel, + profiling, + "[Transpose] Backward_time = {:.2lf} [ms]", + per_device_state, + input_grad, + output_grad); } -static void 
backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - -OpTaskInvocation backward(TransposeAttrs const &) { +OpTaskInvocation backward(TransposeAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); return {TRANSPOSE_BWD_TASK_ID, binding}; @@ -159,7 +120,7 @@ CostMetrics &input_descs, // Note:this may have some problem ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); @@ -169,12 +130,13 @@ CostMetrics DeviceSpecific per_device_state = init_task_impl(init_accessor); - ParallelTensorShape output_shape = get_output_shape(attrs, input_descs.shape); + ParallelTensorShape output_shape = + get_output_shape(attrs, input_descs.shapes); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); fwd_binding.bind_arg(PROFILING, settings); - fwd_binding.bind(INPUT, input_descs.shape); + fwd_binding.bind(INPUT, input_descs.shapes); fwd_binding.bind(OUTPUT, output_shape); auto fwd_accessor = env.get_fwd_accessor(TRANSPOSE_FWD_TASK_ID, fwd_binding); @@ -189,4 +151,4 @@ CostMetrics return make_metrics(forward_time, backward_time, sync_time, env); } -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/lib/runtime/src/ops/transpose.h b/lib/local-execution/src/ops/transpose.h similarity index 97% rename from lib/runtime/src/ops/transpose.h rename to lib/local-execution/src/ops/transpose.h index 52e824ebbf..daa64e8e59 100644 --- a/lib/runtime/src/ops/transpose.h +++ b/lib/local-execution/src/ops/transpose.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/transpose.h" -#include 
"sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/permissions.cc b/lib/local-execution/src/permissions.cc similarity index 66% rename from lib/runtime/src/permissions.cc rename to lib/local-execution/src/permissions.cc index 2992780ae1..e5c46b42f8 100644 --- a/lib/runtime/src/permissions.cc +++ b/lib/local-execution/src/permissions.cc @@ -1,38 +1,8 @@ -#include "permissions.h" +#include "local-execution/permissions.h" #include "utils/exception.h" namespace FlexFlow { -Legion::PrivilegeMode to_legion(Permissions p) { - switch (p) { - case Permissions::NONE: - return LEGION_NO_ACCESS; - case Permissions::RO: - return LEGION_READ_ONLY; - case Permissions::WO: - return LEGION_WRITE_ONLY; - case Permissions::RW: - return LEGION_READ_WRITE; - default: - throw mk_runtime_error("Unknown permission {}", static_cast(p)); - } -} - -optional from_legion(Legion::PrivilegeMode p) { - switch (p) { - case LEGION_NO_ACCESS: - return Permissions::NONE; - case LEGION_READ_ONLY: - return Permissions::RO; - case LEGION_WRITE_ONLY: - return Permissions::WO; - case LEGION_READ_WRITE: - return Permissions::RW; - default: - return nullopt; - } -} - Permissions join(Permissions lhs, Permissions rhs) { if (lhs <= rhs) { return rhs; diff --git a/lib/runtime/src/task_spec/runtime_arg_ref.cc b/lib/local-execution/src/runtime_arg_ref.cc similarity index 55% rename from lib/runtime/src/task_spec/runtime_arg_ref.cc rename to lib/local-execution/src/runtime_arg_ref.cc index a0aa242ce6..df4f024f1d 100644 --- a/lib/runtime/src/task_spec/runtime_arg_ref.cc +++ b/lib/local-execution/src/runtime_arg_ref.cc @@ -1,5 +1,5 @@ -#include "runtime_arg_ref.h" -#include "device_specific.h" +#include "local-execution/runtime_arg_ref.h" +#include "local-execution/device_specific.h" namespace FlexFlow { @@ -11,4 +11,8 @@ RuntimeArgRef> ff_handle() { return {RuntimeArgRefType::FF_HANDLE}; } +RuntimeArgRef> iteration_config() { + return 
{RuntimeArgRefType::FF_ITERATION_CONFIG}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 6d06714252..68636906c3 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -1,4 +1,4 @@ -#include "tracked_allocator.h" +#include "local-execution/tracked_allocator.h" #include "kernels/device.h" namespace FlexFlow { diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/local-execution/src/variadic_tensor_ref.cc new file mode 100644 index 0000000000..efd43a6648 --- /dev/null +++ b/lib/local-execution/src/variadic_tensor_ref.cc @@ -0,0 +1,9 @@ +#include "local-execution/variadic_tensor_ref.h" + +namespace FlexFlow { + +VariadicTensorRef get_input_tensors() { + return {VariadicTensorRefType::INPUT_TENSORS}; +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/get_output_shapes.h b/lib/op-attrs/include/op-attrs/get_output_shapes.h index 6fb93aac91..496cfbb755 100644 --- a/lib/op-attrs/include/op-attrs/get_output_shapes.h +++ b/lib/op-attrs/include/op-attrs/get_output_shapes.h @@ -128,9 +128,7 @@ ParallelTensorShape get_output_shape(DropoutAttrs const &, ParallelTensorShape get_output_shape(ElementBinaryAttrs const &, ParallelTensorShape const &, ParallelTensorShape const &); -ParallelTensorShape get_output_shape(ElementUnaryAttrs const &, - ParallelTensorShape const &); -ParallelTensorShape get_output_shape(ElementScalarUnaryAttrs const &, +ParallelTensorShape get_output_shape(ElementUnaryUnifiedAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(EmbeddingAttrs const &, ParallelTensorShape const &); @@ -153,6 +151,8 @@ ParallelTensorShape get_output_shape(RepartitionAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(ReplicateAttrs const &, ParallelTensorShape const &); +ParallelTensorShape get_output_shape(ReshapeAttrs const &, + 
ParallelTensorShape const &); ParallelTensorShape get_output_shape(ReverseAttrs const &, ParallelTensorShape const &); std::vector get_output_shapes(SplitAttrs const &, diff --git a/lib/op-attrs/include/op-attrs/ops/element_unary.h b/lib/op-attrs/include/op-attrs/ops/element_unary.h index 5e19b81c8c..6a80094dfa 100644 --- a/lib/op-attrs/include/op-attrs/ops/element_unary.h +++ b/lib/op-attrs/include/op-attrs/ops/element_unary.h @@ -21,6 +21,9 @@ struct ElementScalarUnaryAttrs { FF_VISITABLE_STRUCT(ElementScalarUnaryAttrs, op_type, scalar); CHECK_VALID_OP_ATTR(ElementScalarUnaryAttrs); +using ElementUnaryUnifiedAttrs = + std::variant; + } // namespace FlexFlow #endif diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h index a46df59282..3b57a959b8 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear.h +++ b/lib/op-attrs/include/op-attrs/ops/linear.h @@ -36,7 +36,11 @@ CHECK_VALID_OP_ATTR(LinearAttrs); TensorShape get_weights_shape(LinearAttrs const &attrs, TensorShape const &input); +ParallelTensorShape get_weights_shape(LinearAttrs const &attrs, + ParallelTensorShape const &input); TensorShape get_bias_shape(LinearAttrs const &attrs, TensorShape const &input); +ParallelTensorShape get_bias_shape(LinearAttrs const &attrs, + ParallelTensorShape const &input); } // namespace FlexFlow diff --git a/lib/runtime/include/runtime/task_spec/concrete_arg.h b/lib/runtime/include/runtime/task_spec/concrete_arg.h deleted file mode 100644 index 1d973eb81a..0000000000 --- a/lib/runtime/include/runtime/task_spec/concrete_arg.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_INCLUDE_RUNTIME_TASK_SPEC_CONCRETE_ARG_H -#define _FLEXFLOW_RUNTIME_INCLUDE_RUNTIME_TASK_SPEC_CONCRETE_ARG_H - -#include "arg_type_runtime_tag.h" -#include "utils/type_index.h" -#include - -namespace FlexFlow { - -struct ConcreteArgSpec { -public: - ConcreteArgSpec() = delete; - - template - T const &get() { - assert(this->type_tag.matches()); - - return 
*(T const *)ptr.get(); - } - - ArgTypeRuntimeTag get_type_tag() const { - return this->type_tag; - } - size_t serialize(Legion::Serializer &) const; - - template - static ConcreteArgSpec create(T const &t) { - static_assert(is_serializable::value, "Type must be serializable"); - - return ConcreteArgSpec(type_index(), - std::make_shared(t), - ArgTypeRuntimeTag::create()); - } - -private: - ConcreteArgSpec(std::type_index, - std::shared_ptr, - ArgTypeRuntimeTag const &); - - ArgTypeRuntimeTag type_tag; - std::shared_ptr ptr; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/ops/gather.cc b/lib/runtime/src/ops/gather.cc deleted file mode 100644 index 9ef53ffc6a..0000000000 --- a/lib/runtime/src/ops/gather.cc +++ /dev/null @@ -1,416 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gather.h" -#include "embedding.h" -#include "kernels/gather_kernels.h" -#include "legion/legion_utilities.h" - -namespace FlexFlow { - -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; - -using namespace FlexFlow::Kernels::Gather; - -GatherParams Gather::get_params() const { - GatherParams params; - params.legion_dim = this->legion_dim; - params.layer_guid = this->layer_guid; - return params; -} - -Tensor FFModel::gather(const Tensor input, - const Tensor index, - int dim, - char const *name) { - Layer *gather = new Layer(this, - OP_GATHER, - DT_FLOAT, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*output*/, - input, - index); - assert(index->data_type == DT_INT32 || index->data_type == DT_INT64); - assert(input->num_dims == index->num_dims); - int legion_dim = input->num_dims - 1 - dim; - // https://pytorch.org/docs/stable/generated/torch.gather.html - // Currently we assume index.size(d) == input.size(d) for all - // dimensions d != dim, which is a stronger constraint that PyTorch's - for (int i = 0; i < input->num_dims; i++) { - if (i != legion_dim) { - assert(input->dims[i] == index->dims[i]); - } - } - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < index->num_dims; i++) { - dims[i] = index->dims[i]; - } - gather->outputs[0] = create_tensor_legion_ordering( - index->num_dims, dims, input->data_type, gather, 0, true /*create_grad*/); - gather->add_int_property("legion_dim", legion_dim); - layers.push_back(gather); - return gather->outputs[0]; -} - -Op *Gather::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - 
layer->get_int_property("legion_dim", value); - int legion_dim = value; - return new Gather( - model, layer->layer_guid, inputs[0], inputs[1], legion_dim, layer->name); -} - -Gather::Gather(FFModel &model, - GatherParams const ¶ms, - std::pair const &inputs, - char const *name) - : Gather(model, - params.layer_guid, - inputs.first, - inputs.second, - params.legion_dim, - name) {} - -Gather::Gather(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor input, - const ParallelTensor index, - int _legion_dim, - char const *name) - : Op(model, - OP_GATHER, - input->data_type, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - index), - legion_dim(_legion_dim) { - layer_guid = _layer_guid; - // Assume that input and index have the same paralleldim except - // for the legion_dim-th dim, which cannot be parallelized - for (int i = 0; i < input->num_dims; i++) { - if (i != legion_dim) { - assert(input->dims[i] == index->dims[i]); - } - } - assert(index->dims[legion_dim].degree == 1); - assert(input->dims[legion_dim].degree == 1); - // output has the same parallel dims as index - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < index->num_dims; i++) { - dims[i] = index->dims[i]; - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - index->num_dims, dims, input->data_type, this); -} - -void Gather::serialize(Legion::Serializer &sez) const { - GatherParams params = get_params(); - sez.serialize(params.legion_dim); - sez.serialize(this->layer_guid.id); -} - -using PCG::Node; -/*static*/ -Node Gather::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 2); - int legion_dim; - dez.deserialize(legion_dim); - size_t id; - dez.deserialize(id); - LayerID layer_guid(id); - - GatherParams params; - params.legion_dim = legion_dim; - params.layer_guid = layer_guid; - return ff.get_or_create_node({inputs[0], inputs[1]}, params); -} - -Op *Gather::materialize(FFModel 
&ff, - ParallelTensor inputs[], - int num_inputs) const { - GatherParams params = get_params(); - return new Gather(ff, params, {inputs[0], inputs[1]}, this->name); -} - -void Gather::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(GATHER_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Gather)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -PerDeviceOpState *Gather::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - Gather const *gather = (Gather const *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - GatherMeta *m = new GatherMeta(handle, gather); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = 
helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - assert(input.domain.get_dim() == index.domain.get_dim()); - assert(output.domain.get_dim() == index.domain.get_dim()); - for (int i = 0; i < input.domain.get_dim(); i++) { - assert(index.domain.hi()[i] == output.domain.hi()[i]); - assert(index.domain.lo()[i] == output.domain.lo()[i]); - if (i != m->legion_dim) { - assert(input.domain.hi()[i] == index.domain.hi()[i]); - assert(input.domain.lo()[i] == index.domain.lo()[i]); - } - } - return m; -} - -void Gather::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(GATHER_FWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, false), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} - -void Gather::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - GatherMeta const *m = *((GatherMeta **)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], 
regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, index, output); -} - -void Gather::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); - IndexLauncher launcher(GATHER_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} - -void Gather::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - GatherMeta const *m = *((GatherMeta **)task->local_args); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - backward_kernel_wrapper(m, output_grad, index, input_grad); -} - -bool 
Gather::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_input, sub_index, sub_output; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - if (!inputs[1]->get_sub_tensor(mv, sub_index)) { - return false; - } - GatherMeta *m = new GatherMeta(sim->handler, this); - sim->free_all(); - bool out_of_memory = false; - Domain input_domain = sub_input.get_domain(); - void *input_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW input_acc( - inputs[0]->data_type, input_domain, input_ptr); - Domain index_domain = sub_index.get_domain(); - void *index_ptr = sim->allocate(sub_index.get_volume(), inputs[1]->data_type); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW index_acc( - inputs[1]->data_type, index_domain, index_ptr); - out_of_memory = out_of_memory || (input_ptr == NULL) || (index_ptr == NULL); - Domain out_domain = sub_output.get_domain(); - void *output_ptr = - sim->allocate(sub_output.get_volume(), outputs[0]->data_type); - out_of_memory = out_of_memory || (output_ptr == NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW output_acc( - outputs[0]->data_type, out_domain, output_ptr); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(m, input_acc, index_acc, output_acc); - }; - if (sim->computationMode == COMP_MODE_TRAINING) { - backward = [&] { - backward_kernel_wrapper(m, output_acc, index_acc, input_acc); - }; - } - - inner_measure_operator_cost(sim, forward, backward, 
cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - printf("[Measure Gather] name(%s) forward_time(%.4lf) " - "backward_time(%.4lf)\n", - name, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Gather] name(%s) forward_time(%.4lf)\n", - name, - cost_metrics.forward_time); - } - delete m; - return true; -} - -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::GatherParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.legion_dim); - hash_combine(key, params.layer_guid.id); - return key; -} -}; // namespace std diff --git a/lib/runtime/src/ops/gather.h b/lib/runtime/src/ops/gather.h deleted file mode 100644 index 1ea20b71f5..0000000000 --- a/lib/runtime/src/ops/gather.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef _FLEXFLOW_OPS_GATHER_H -#define _FLEXFLOW_OPS_GATHER_H - -#include "op-attrs/ops/gather.h" -#include "op_task_invocation.h" -#include "sim_environment.h" - -namespace FlexFlow { - -template <> -void register_task(); -template <> -void register_task(); -template <> -void register_task(); - -OpTaskInvocation init(GatherAttrs const &); -OpTaskInvocation forward(GatherAttrs const &); -OpTaskInvocation backward(GatherAttrs const &); - -CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, - GatherAttrs const &attrs, - ParallelTensorShape const &input_shape, - ParallelTensorShape const &index_shape, - ProfilingSettings const &settings, - MachineView const &machine_view); - -/* class Gather : public Op { */ -/* public: */ -/* Gather(FFModel &model, */ -/* ParallelTensor const &input, */ -/* ParallelTensor const &index, */ -/* int legion_dim, */ -/* char const *name = nullptr); */ -/* void init(FFModel const &) override; */ -/* void forward(FFModel const &) override; */ -/* void backward(FFModel const &) override; */ - -/* static Op * */ -/* create_operator_from_layer(FFModel &model, */ -/* Layer const *layer, */ -/* std::vector const &inputs); - */ - 
-/* static PerDeviceOpState *init_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void forward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void backward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* bool measure_operator_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const override; */ -/* void serialize(Legion::Serializer &s) const override; */ -/* /1* static PCG::Node deserialize(FFModel &ff, *1/ */ -/* /1* Legion::Deserializer &d, *1/ */ -/* /1* ParallelTensor inputs[], *1/ */ -/* /1* int num_inputs); *1/ */ -/* Op *materialize(FFModel &ff, */ -/* ParallelTensor inputs[], */ -/* int num_inputs) const override; */ - -/* public: */ -/* int legion_dim; */ -/* }; */ - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/task_spec/op_task_invocation.cc b/lib/runtime/src/task_spec/op_task_invocation.cc deleted file mode 100644 index fbbfe47726..0000000000 --- a/lib/runtime/src/task_spec/op_task_invocation.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "op_task_invocation.h" -#include "task_argument_accessor.h" - -namespace FlexFlow { - -OpTaskSignature get_signature(task_id_t const &) { - NOT_IMPLEMENTED(); -} - -OpTensorSpec::OpTensorSpec(TensorRole _role, int _idx) - : role(_role), idx(_idx) {} - -OpTensorSpec input_tensor(int idx) { - return {TensorRole::INPUT, idx}; -} - -OpTensorSpec output_tensor(int idx) { - return {TensorRole::OUTPUT, idx}; -} - -OpTensorSpec weight_tensor(int idx) { - return {TensorRole::WEIGHT, idx}; -} - -// OpTaskBinding::OpTaskBinding() { -// this->serializer.reserve_bytes(sizeof(TaskArgumentFormat)); -// } - -void OpTaskBinding::bind(slot_id slot, OpTensorSpec const &tensor_spec) { - 
this->tensor_bindings.insert({{slot, IsGrad::NO}, tensor_spec}); -} - -void OpTaskBinding::bind_grad(slot_id slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert({{slot, IsGrad::YES}, tensor_spec}); -} - -std::unordered_map, OpTensorSpec> const & - OpTaskBinding::get_tensor_bindings() const { - return this->tensor_bindings; -} - -std::unordered_map const & - OpTaskBinding::get_arg_bindings() const { - return this->arg_bindings; -} - -} // namespace FlexFlow diff --git a/lib/runtime/src/task_spec/op_task_invocation.h b/lib/runtime/src/task_spec/op_task_invocation.h deleted file mode 100644 index 56e709734e..0000000000 --- a/lib/runtime/src/task_spec/op_task_invocation.h +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H -#define _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H - -#include "accessor.h" -#include "index_task_invocation.h" -#include "legion.h" -#include "op_arg_ref.h" -#include "op_task_signature.h" -#include "op_tensor_spec.h" -#include "runtime/config.h" -#include "runtime/profiling.h" -#include "serialization.h" -#include "standard_task_invocation.h" -#include "tasks.h" -#include "utils/bidict.h" -#include "utils/optional.h" -#include "utils/stack_map.h" -#include "variadic_tensor_ref.h" -#include -#include -#include - -namespace FlexFlow { - -enum class IsTrainable { YES, NO }; - -using OpArgSpec = variant; - -struct OpTaskBinding { - OpTaskBinding() = default; - - static_assert(is_subeq_variant::value, ""); - - void bind(slot_id, OpTensorSpec const &); - void bind_grad(slot_id, OpTensorSpec const &); - - template - void bind(slot_id name, VariadicTensorRef const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_device_specific_arg(slot_id name, T const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_device_specific_arg(slot_id name, OpArgRef const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_arg(slot_id name, T const &t) { - this->insert_arg_spec(name, ConcreteArgSpec::create(t)); - } - - template 
- void bind_arg(slot_id name, RuntimeArgRef const &ref) { - this->insert_arg_spec(name, RuntimeArgRefSpec::create(ref)); - } - - template - void bind_arg(slot_id name, OpArgRef const &ref) { - this->insert_arg_spec(name, OpArgRefSpec::create(ref)); - } - - template - void bind_arg(slot_id name, TypedFuture const &f) { - this->insert_arg_spec(name, CheckedTypedFuture::create(f)); - } - - template - void bind_arg(slot_id name, TypedFutureMap const &fm) { - this->insert_arg_spec(name, CheckedTypedFutureMap::create(fm)); - } - - std::unordered_map, OpTensorSpec> const & - get_tensor_bindings() const; - std::unordered_map const &get_arg_bindings() const; - -private: - void insert_arg_spec(slot_id name, OpArgSpec const &arg_spec) { - assert(!contains_key(this->arg_bindings, name)); - this->arg_bindings.insert({name, arg_spec}); - } - - // template - // ArgSpec generate_arg_spec(T const &t) { - // static_assert(is_serializable, "Type must be serializable"); - - // size_t pre_size = serializer.get_used_bytes(); - // ff_task_serialize(serializer, t); - // size_t post_size = serializer.get_used_bytes(); - // return { - // typeid(T), - // pre_size, - // post_size - pre_size - // }; - // } - - /* Legion::Serializer serializer; */ - std::unordered_map arg_bindings; - std::unordered_map, OpTensorSpec> tensor_bindings; -}; - -struct OpTaskInvocation : public use_visitable_cmp { -public: - OpTaskInvocation() = delete; - OpTaskInvocation(task_id_t const &task_id, OpTaskBinding const &binding) - : task_id(task_id), binding(binding) {} - -public: - task_id_t task_id; - OpTaskBinding binding; -}; - -OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); -OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -OpTaskSignature get_op_signature(task_id_t const &); - -/* std::unordered_map get_regions_idxs(TaskArgumentFormat - * const &); */ - -/* TaskArgumentFormat compile_task_invocation(OpTaskSignature const &, - * OpTaskBinding const &); */ - -} // namespace FlexFlow - 
-#endif diff --git a/lib/runtime/src/task_spec/op_tensor_spec.h b/lib/runtime/src/task_spec/op_tensor_spec.h deleted file mode 100644 index d859bb3072..0000000000 --- a/lib/runtime/src/task_spec/op_tensor_spec.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_TENSOR_SPEC_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_TENSOR_SPEC_REF_H - -#include "op_task_signature.h" - -namespace FlexFlow { - -struct OpTensorSpec { - TensorRole role; - req idx; -}; -FF_VISITABLE_STRUCT(OpTensorSpec, role, idx); - -OpTensorSpec input_tensor(int); -OpTensorSpec output_tensor(int); -OpTensorSpec weight_tensor(int); - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h deleted file mode 100644 index 9cc05b8252..0000000000 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ /dev/null @@ -1,193 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H - -#include "accessor.h" -#include "device_specific.h" -#include "realm_allocator.h" -#include "runtime/config.h" -#include "utils/exception.h" -#include "utils/stack_map.h" -#include "utils/strong_typedef.h" -#include - -namespace FlexFlow { - -struct region_idx_t : strong_typedef { - using strong_typedef::strong_typedef; -}; - -FF_TYPEDEF_HASHABLE(region_idx_t); -FF_TYPEDEF_PRINTABLE(region_idx_t, "region_idx"); - -using NonvariadicFormat = region_idx_t; -using VariadicFormat = std::vector; - -using TensorArgumentFormat = variant; - -bool is_variadic(TensorArgumentFormat const &); -VariadicFormat get_variadic_format(TensorArgumentFormat const &); -NonvariadicFormat get_nonvariadic_format(TensorArgumentFormat const &); - -struct TaskArgumentFormat { - std::type_index type; - size_t start; - req end; -}; -FF_VISITABLE_STRUCT(TaskArgumentFormat, type, start, end); - -struct FutureArgumentFormat { - std::type_index type; - req future_idx; -}; 
-FF_VISITABLE_STRUCT(FutureArgumentFormat, type, future_idx); - -struct TaskArgumentsFormat { - TaskArgumentsFormat() = default; - - stack_map region_idxs; - stack_map args; - stack_map futures; - stack_map regions; - stack_map data_types; - - void insert(std::pair const &); - void insert(std::pair const &); - - void insert(region_idx_t, Legion::PrivilegeMode, DataType); - void insert(slot_id, region_idx_t); - void insert(slot_id, std::vector const &); -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - TaskArgumentsFormat, region_idxs, args, futures, regions, data_types); - -Legion::PrivilegeMode get_privileges(TaskArgumentsFormat const &, - region_idx_t const &); -Legion::PrivilegeMode get_privileges(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -Permissions get_permissions(TaskArgumentsFormat const &, region_idx_t const &); -Permissions get_permissions(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -region_idx_t get_region_idx(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &); - -struct TaskArgumentAccessor { - TaskArgumentAccessor(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - Allocator get_allocator() const { - return get_gpu_memory_allocator(this->task); - } - - template - T const &get_argument(slot_id slot) const { - NOT_IMPLEMENTED(); - // TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); - // std::type_index actual_type = arg_fmt.type; - // std::type_index requested_type = {typeid(T)}; - - // if (actual_type != requested_type) { - // throw mk_runtime_error( - // "Type mismatch in argument access (\"{}\" != \"{}\")", - // actual_type.name(), - // requested_type.name()); - // } - - // void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; - // Legion::Deserializer dez(start_ptr, arg_fmt.start); - - // return ff_task_deserialize(dez); - } - - template 
- optional get_optional_argument(slot_id) const { - NOT_IMPLEMENTED(); - } - - template - std::vector get_variadic_argument(slot_id) const { - NOT_IMPLEMENTED(); - } - - template - privilege_mode_to_accessor - get_generic_accessor(region_idx_t const &idx) const { - auto tensor_privs = get_permissions(this->args_fmt, idx); - if (tensor_privs != PRIV) { - throw mk_runtime_error( - "Privilege mismatch while accessing tensor: {} != {}", - tensor_privs, - PRIV); - } - - return helperGetGenericTensorAccessor( - get_datatype(this->args_fmt, idx), - regions[idx.value()], - task->regions[idx.value()], - FID_DATA, - ctx, - runtime); - } - - template - privilege_mode_to_accessor get_tensor(slot_id slot) const { - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - - return this->get_generic_accessor(argument_format); - } - - template - privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - - template - std::vector> - get_variadic_tensor(slot_id slot) const { - std::vector> result; - - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - for (NonvariadicFormat const &argument : argument_format) { - result.push_back(this->get_generic_accessor(argument)); - } - - return result; - } - - template - std::vector> - get_variadic_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - - template - T *unwrap(DeviceSpecific const &arg) const { - return arg.get(this->get_device_idx()); - } - - template - DeviceSpecific create_device_specific(Args &&...args) const { - return DeviceSpecific::create(this->get_device_idx(), - std::forward(args)...); - } - - size_t get_device_idx() const { - NOT_IMPLEMENTED(); - } - -private: - Legion::Task const *task; - std::vector const ®ions; - Legion::Context ctx; - Legion::Runtime *runtime; - TaskArgumentsFormat const &args_fmt; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/task_spec/variadic_tensor_ref.h b/lib/runtime/src/task_spec/variadic_tensor_ref.h 
deleted file mode 100644 index a9d1b54731..0000000000 --- a/lib/runtime/src/task_spec/variadic_tensor_ref.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_VARIADIC_TENSOR_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_VARIADIC_TENSOR_ARG_REF_H - -#include "arg_ref.h" -#include "op_tensor_spec.h" - -namespace FlexFlow { - -enum class VariadicTensorRefType { INPUT_TENSORS }; - -template -using VariadicTensorRef = ArgRef; - -VariadicTensorRef get_input_tensors() { - return {VariadicTensorRefType::INPUT_TENSORS}; -} - -} // namespace FlexFlow - -#endif diff --git a/lib/utils/include/utils/type_index.h b/lib/utils/include/utils/type_index.h index 49e893faa0..77a377a48d 100644 --- a/lib/utils/include/utils/type_index.h +++ b/lib/utils/include/utils/type_index.h @@ -3,17 +3,18 @@ #include "fmt.h" #include +#include namespace FlexFlow { template -std::type_index type_index() { +std::type_index get_type_index_for_type() { return std::type_index(typeid(T)); } template bool matches(std::type_index idx) { - return idx == type_index(); + return idx == get_type_index_for_type(); } } // namespace FlexFlow