From 109c9f4375d318abc04a8d1fee80edc2c916cc9e Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 2 Aug 2023 00:30:50 +0000
Subject: [PATCH 01/13] start to local backend for aggregator

---
 lib/CMakeLists.txt                            | 10 +++---
 .../src/task_spec/task_argument_accessor.h    | 32 +++++++++++++++++++
 2 files changed, 37 insertions(+), 5 deletions(-)
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 57d9edab3d..93dbf50f80 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -1,8 +1,8 @@
-add_subdirectory(pcg)
-add_subdirectory(compiler)
-# add_subdirectory(runtime)
-add_subdirectory(op-attrs)
-add_subdirectory(kernels)
+#add_subdirectory(pcg)
+#add_subdirectory(compiler)
+ add_subdirectory(runtime)
+#add_subdirectory(op-attrs)
+#add_subdirectory(kernels)
 add_subdirectory(utils)
 # add_subdirectory(ffi)
 add_subdirectory(substitutions)
\ No newline at end of file
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h
index 4a4cf64512..7d323430db 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.h
+++ b/lib/runtime/src/task_spec/task_argument_accessor.h
@@ -70,6 +70,36 @@ region_idx_t get_region_idx(TaskArgumentsFormat const &,
                             parallel_tensor_guid_t const &);
 DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &);
 
+
+struct ITaskArgumentAccessor {
+template <typename T> T const &get_argument(slot_id slot) const;
+template <typename T> optional<T> get_optional_argument(slot_id slot) const;
+template <typename T> std::vector<T> get_variadic_argument(slot_id slot) const;
+template <Permissions PRIV> get_tensor(slot_id slot) const ;
+template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id slot) const;
+template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_tensor(slot_id slot) const;
+  template <Permissions PRIV>
+std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_tensor_grad(slot_id slot) const;
+};
+
+struct ILegionTaskArgumentAccessor: public ITaskArgumentAccessor{
+
+private:
+  Legion::Task const *task;
+  std::vector<Legion::PhysicalRegion> const &regions;
+  Legion::Context ctx;
+  Legion::Runtime *runtime;
+  TaskArgumentsFormat const &args_fmt;
+};
+
+struct ILocalTaskArgumentAccessor: public ITaskArgumentAccessor{
+
+};
+
 struct TaskArgumentAccessor {
   TaskArgumentAccessor(Legion::Task const *task,
                        std::vector<Legion::PhysicalRegion> const &regions,
@@ -168,6 +198,8 @@ struct TaskArgumentAccessor {
   Legion::Context ctx;
   Legion::Runtime *runtime;
   TaskArgumentsFormat const &args_fmt;
+  //cow_ptr_t<IMultiDiGraph> const & ITaskArgumentAccesor; 
+  cow_ptr_t<ITaskArgumentAccessor> const & ptr;
 };
 
 } // namespace FlexFlow

From ad7b7f337b4fe1cb4656a37d4050d8c18acef071 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 2 Aug 2023 02:21:30 +0000
Subject: [PATCH 02/13] add ITaskArgumentAccessor, LocalTaskArgumentAccessor,
 LegionTaskArgumentAccessor

---
 lib/runtime/src/sim_environment.cc            |  26 ++
 lib/runtime/src/sim_environment.h             |  11 +-
 .../src/task_spec/task_argument_accessor.cc   |  39 +++
 .../src/task_spec/task_argument_accessor.h    | 294 ++++++++++++------
 4 files changed, 267 insertions(+), 103 deletions(-)
 create mode 100644 lib/runtime/src/sim_environment.cc
 create mode 100644 lib/runtime/src/task_spec/task_argument_accessor.cc

diff --git a/lib/runtime/src/sim_environment.cc b/lib/runtime/src/sim_environment.cc
new file mode 100644
index 0000000000..a3a14a0b64
--- /dev/null
+++ b/lib/runtime/src/sim_environment.cc
@@ -0,0 +1,26 @@
+#include "sim_environment.h"
+
+namespaec FlexFlow {
+
+void SimTaskBinding::bind(slot_id id, ParallelTensorShape const & shape) {
+    tensor_shape_bindings.insert(id, shape);
+}
+void SimTaskBinding::bind(slot_id id, TensorShape const & shape) {
+    tensor_shape_bindings.insert(id, shape);
+}
+
+void SimTaskBinding::bind(slot_id  id, InputVariadicParallelTensorDesc const & desc) {
+    this->tensor_shape_bindings.insert(id, desc);
+}
+
+void SimTaskBinding::bind_arg(slot_id id, SimArg const & arg) {
+    arg_bindings.insert(id, arg);
+}
+
+TaskArgumentAccessor SimEnvironment::get_fwd_accessor(task_id_t tid, SimTaskBinding const & sim_task_binding) {
+    NOT_IMPLEMENTED();//TODO
+}
+
+
+
+} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/runtime/src/sim_environment.h b/lib/runtime/src/sim_environment.h
index d08bef653f..7af7a84125 100644
--- a/lib/runtime/src/sim_environment.h
+++ b/lib/runtime/src/sim_environment.h
@@ -4,9 +4,11 @@
 #include "cost_metrics.h"
 #include "kernels/accessor.h"
 #include "kernels/allocation.h"
+#include "kernels/profiling.h"
 #include "op-attrs/parallel_tensor_shape.h"
 #include "task_spec/op_task_invocation.h"
 #include "task_spec/task_argument_accessor.h"
+#include <unordered_map>
 #include <vector>
 
 namespace FlexFlow {
@@ -23,6 +25,9 @@ struct InputVariadicParallelTensorDesc {
   IsTrainable trainable;
 };
 
+using SimArg = variant<ProfilingSettings, AggregateAttrs>;
+using SimTensorSpec = variant<ParallelTensorShape, InputParallelTensorDesc, InputVariadicParallelTensorDesc>;
+
 struct SimTaskBinding {
   void bind(slot_id, ParallelTensorShape const &);
   void bind_untrainable(slot_id, ParallelTensorShape const &);
@@ -33,9 +38,11 @@ struct SimTaskBinding {
   void bind_untrainable(slot_id, std::vector<ParallelTensorShape> const &);
   void bind(slot_id, std::vector<ParallelTensorShape> const &, IsTrainable);
   void bind(slot_id, InputVariadicParallelTensorDesc const &);
+  
+  template <typename T> void bind_arg(slot_id id, T const & name);
 
-  template <typename T>
-  void bind_arg(slot_id, T const &);
+  std::unordered_map<slot_id, SimArg> arg_bindings;
+  std::unordered_map<slot_id, SimTensorSpec> tensor_shape_bindings;
 };
 
 SimTaskBinding infer_bwd_binding(SimTaskBinding const &);
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc
new file mode 100644
index 0000000000..7427347456
--- /dev/null
+++ b/lib/runtime/src/task_spec/task_argument_accessor.cc
@@ -0,0 +1,39 @@
+#include "task_argument_accessor.h"
+
+using namespace FlexFlow {
+
+template <typename T> T const & LocalTaskArgumentAccessor::get_argument(slot_id slot)  const{
+    if(slot == PROFILING) {
+        return get<ProfilingSettings>(this->arg_bindings.at(slot));
+    } elif (slot == ATTRS) {
+        return get<AggregateAttrs>(this->arg_bindings.at(slot));
+    } else {
+        throw std::runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_argument");
+    }
+}
+
+template <Permissions PRIV> privilege_mode_to_accessor<PRIV> LocalTaskArgumentAccessor::get_tensor(slot_id slot) const {
+    SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot);
+//     NOT_IMPLEMENTED();//TODO, I should convert spec to privilege_mode_to_accessor<PRIV>
+//lib/runtime/src/accessor.h
+//     template <>
+// struct privilege_mode_to_accessor_t<Permissions::RW> {
+//   using type = GenericTensorAccessorW;
+// };
+
+// template <>
+// struct privilege_mode_to_accessor_t<Permissions::RO> {
+//   using type = GenericTensorAccessorR;
+// };
+
+// template <>
+// struct privilege_mode_to_accessor_t<Permissions::WO> {
+//   using type = GenericTensorAccessorW;
+// };
+
+// template <Permissions PRIV>
+// using privilege_mode_to_accessor =
+//     typename privilege_mode_to_accessor_t<PRIV>::type;
+}
+
+}//namespace FlexFlow
\ No newline at end of file
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h
index 7d323430db..16e34dca02 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.h
+++ b/lib/runtime/src/task_spec/task_argument_accessor.h
@@ -72,21 +72,83 @@ DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &);
 
 
 struct ITaskArgumentAccessor {
-template <typename T> T const &get_argument(slot_id slot) const;
-template <typename T> optional<T> get_optional_argument(slot_id slot) const;
-template <typename T> std::vector<T> get_variadic_argument(slot_id slot) const;
-template <Permissions PRIV> get_tensor(slot_id slot) const ;
-template <Permissions PRIV>
-  privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id slot) const;
-template <Permissions PRIV>
-  std::vector<privilege_mode_to_accessor<PRIV>>
-      get_variadic_tensor(slot_id slot) const;
-  template <Permissions PRIV>
-std::vector<privilege_mode_to_accessor<PRIV>>
-      get_variadic_tensor_grad(slot_id slot) const;
+virtual template <typename T> T const &get_argument(slot_id slot) const = 0;
+
+
+virtual template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const = 0;
 };
 
-struct ILegionTaskArgumentAccessor: public ITaskArgumentAccessor{
+struct LegionTaskArgumentAccessor: public ITaskArgumentAccessor{
+
+public:
+virtual template <typename T> T const &get_argument(slot_id slot) const override;
+virtual template <Permissions PRIV> privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const override;
+
+//   template <typename T>
+//   T const &get_argument(slot_id slot) const {
+//     TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot);
+//     std::type_index actual_type = arg_fmt.type;
+//     std::type_index requested_type = {typeid(T)};
+
+//     if (actual_type != requested_type) {
+//       throw mk_runtime_error(
+//           "Type mismatch in argument access (\"{}\" != \"{}\")",
+//           actual_type.name(),
+//           requested_type.name());
+//     }
+
+//     void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start];
+//     Legion::Deserializer dez(start_ptr, arg_fmt.size());
+
+//     return ff_task_deserialize<T>(dez);
+//   }
+
+LegionTaskArgumentAccessor(Legion::Task const *task,
+                       std::vector<Legion::PhysicalRegion> const &regions,
+                       Legion::Context ctx,
+                       Legion::Runtime *runtime):task(task), regions(regions), ctx(ctx), runtime(runtime){}
+
+// template <Permissions PRIV> privilege_mode_to_accessor<PRIV>
+//       get_generic_accessor(region_idx_t const &idx) const {
+//     auto tensor_privs = get_permissions(this->args_fmt, idx);
+//     if (tensor_privs != PRIV) {
+//       throw mk_runtime_error(
+//           "Privilege mismatch while accessing tensor: {} != {}",
+//           tensor_privs,
+//           PRIV);
+//     }
+
+//     return helperGetGenericTensorAccessor<PRIV>(
+//         get_datatype(this->args_fmt, idx),
+//         regions[idx.value()],
+//         task->regions[idx.value()],
+//         FID_DATA,
+//         ctx,
+//         runtime);
+//   }
+
+//     template <Permissions PRIV>
+//   privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const {
+//     auto argument_format =
+//         get<NonvariadicFormat>(this->args_fmt.region_idxs.at(slot));
+
+//     return this->get_generic_accessor<PRIV>(argument_format);
+//   }
+
+//   template <Permissions PRIV>
+//   std::vector<privilege_mode_to_accessor<PRIV>>
+//       get_variadic_tensor(slot_id slot) const {
+//     std::vector<privilege_mode_to_accessor<PRIV>> result;
+
+//     auto argument_format =
+//         get<VariadicFormat>(this->args_fmt.region_idxs.at(slot));
+//     for (NonvariadicFormat const &argument : argument_format) {
+//       result.push_back(this->get_generic_accessor<PRIV>(argument));
+//     }
+
+//     return result;
+//   }
 
 private:
   Legion::Task const *task;
@@ -96,110 +158,140 @@ struct ILegionTaskArgumentAccessor: public ITaskArgumentAccessor{
   TaskArgumentsFormat const &args_fmt;
 };
 
-struct ILocalTaskArgumentAccessor: public ITaskArgumentAccessor{
+struct LocalTaskArgumentAccessor: public ITaskArgumentAccessor{
 
-};
+public:
+virtual template <typename T> T const &get_argument(slot_id slot) const override;
+virtual template <Permissions PRIV> privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const override;
 
-struct TaskArgumentAccessor {
-  TaskArgumentAccessor(Legion::Task const *task,
-                       std::vector<Legion::PhysicalRegion> const &regions,
-                       Legion::Context ctx,
-                       Legion::Runtime *runtime);
+LocalTaskArgumentAccessor(std::shared_ptr<SimTaskBinding const>  & sim_task_binding):sim_task_binding(sim_task_binding){}
 
-  template <typename T>
-  T const &get_argument(slot_id slot) const {
-    TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot);
-    std::type_index actual_type = arg_fmt.type;
-    std::type_index requested_type = {typeid(T)};
-
-    if (actual_type != requested_type) {
-      throw mk_runtime_error(
-          "Type mismatch in argument access (\"{}\" != \"{}\")",
-          actual_type.name(),
-          requested_type.name());
-    }
-
-    void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start];
-    Legion::Deserializer dez(start_ptr, arg_fmt.size());
-
-    return ff_task_deserialize<T>(dez);
-  }
+private:
+  std::shared_ptr<SimTaskBinding const> sim_task_binding;
 
-  template <typename T>
-  optional<T> get_optional_argument(slot_id) const {
-    NOT_IMPLEMENTED();
-  }
+};
 
-  template <typename T>
-  std::vector<T> get_variadic_argument(slot_id) const {
-    NOT_IMPLEMENTED();
+struct TaskArgumentAccessor {
+  // TaskArgumentAccessor(Legion::Task const *task,
+  //                      std::vector<Legion::PhysicalRegion> const &regions,
+  //                      Legion::Context ctx,
+  //                      Legion::Runtime *runtime);
+    template <typename T>
+  T const &get_argument(slot_id slot) const {
+    return this->ptr->get_argument<T>(slot);
   }
 
-  template <Permissions PRIV>
-  privilege_mode_to_accessor<PRIV>
-      get_generic_accessor(region_idx_t const &idx) const {
-    auto tensor_privs = get_permissions(this->args_fmt, idx);
-    if (tensor_privs != PRIV) {
-      throw mk_runtime_error(
-          "Privilege mismatch while accessing tensor: {} != {}",
-          tensor_privs,
-          PRIV);
-    }
-
-    return helperGetGenericTensorAccessor<PRIV>(
-        get_datatype(this->args_fmt, idx),
-        regions[idx.value()],
-        task->regions[idx.value()],
-        FID_DATA,
-        ctx,
-        runtime);
-  }
 
-  template <Permissions PRIV>
+    template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const {
-    auto argument_format =
-        get<NonvariadicFormat>(this->args_fmt.region_idxs.at(slot));
-
-    return this->get_generic_accessor<PRIV>(argument_format);
+    return this->ptr->get_tensor<PRIV>(slot);
   }
 
-  template <Permissions PRIV>
-  privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id slot) const {
-    NOT_IMPLEMENTED();
-  }
 
-  template <Permissions PRIV>
-  std::vector<privilege_mode_to_accessor<PRIV>>
-      get_variadic_tensor(slot_id slot) const {
-    std::vector<privilege_mode_to_accessor<PRIV>> result;
 
-    auto argument_format =
-        get<VariadicFormat>(this->args_fmt.region_idxs.at(slot));
-    for (NonvariadicFormat const &argument : argument_format) {
-      result.push_back(this->get_generic_accessor<PRIV>(argument));
-    }
-
-    return result;
+  // template <typename T>
+  // T const &get_argument(slot_id slot) const {
+  //   TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot);
+  //   std::type_index actual_type = arg_fmt.type;
+  //   std::type_index requested_type = {typeid(T)};
+
+  //   if (actual_type != requested_type) {
+  //     throw mk_runtime_error(
+  //         "Type mismatch in argument access (\"{}\" != \"{}\")",
+  //         actual_type.name(),
+  //         requested_type.name());
+  //   }
+
+  //   void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start];
+  //   Legion::Deserializer dez(start_ptr, arg_fmt.size());
+
+  //   return ff_task_deserialize<T>(dez);
+  // }
+
+  // template <typename T>
+  // optional<T> get_optional_argument(slot_id) const {
+  //   NOT_IMPLEMENTED();
+  // }
+
+  // template <typename T>
+  // std::vector<T> get_variadic_argument(slot_id) const {
+  //   NOT_IMPLEMENTED();
+  // }
+
+  // template <Permissions PRIV>
+  // privilege_mode_to_accessor<PRIV>
+  //     get_generic_accessor(region_idx_t const &idx) const {
+  //   auto tensor_privs = get_permissions(this->args_fmt, idx);
+  //   if (tensor_privs != PRIV) {
+  //     throw mk_runtime_error(
+  //         "Privilege mismatch while accessing tensor: {} != {}",
+  //         tensor_privs,
+  //         PRIV);
+  //   }
+
+  //   return helperGetGenericTensorAccessor<PRIV>(
+  //       get_datatype(this->args_fmt, idx),
+  //       regions[idx.value()],
+  //       task->regions[idx.value()],
+  //       FID_DATA,
+  //       ctx,
+  //       runtime);
+  // }
+
+  // template <Permissions PRIV>
+  // privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const {
+  //   auto argument_format =
+  //       get<NonvariadicFormat>(this->args_fmt.region_idxs.at(slot));
+
+  //   return this->get_generic_accessor<PRIV>(argument_format);
+  // }
+
+  // template <Permissions PRIV>
+  // privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id slot) const {
+  //   NOT_IMPLEMENTED();
+  // }
+
+  // template <Permissions PRIV>
+  // std::vector<privilege_mode_to_accessor<PRIV>>
+  //     get_variadic_tensor(slot_id slot) const {
+  //   std::vector<privilege_mode_to_accessor<PRIV>> result;
+
+  //   auto argument_format =
+  //       get<VariadicFormat>(this->args_fmt.region_idxs.at(slot));
+  //   for (NonvariadicFormat const &argument : argument_format) {
+  //     result.push_back(this->get_generic_accessor<PRIV>(argument));
+  //   }
+
+  //   return result;
+  // }
+
+  // template <Permissions PRIV>
+  // std::vector<privilege_mode_to_accessor<PRIV>>
+  //     get_variadic_tensor_grad(slot_id slot) const {
+  //   NOT_IMPLEMENTED();
+  // }
+
+  // size_t get_device_idx() const {
+  //   NOT_IMPLEMENTED();
+  // }
+  template <typename T, typename... Args>
+  static
+      typename std::enable_if<std::is_base_of<ITaskArgumentAccessor, T>::value,
+                              TaskArgumentAccessor>::type
+      create(Args &&...args) {
+    return TaskArgumentAccessor(
+        std::make_shared<T>(std::forward<Args>(args)...));
   }
-
-  template <Permissions PRIV>
-  std::vector<privilege_mode_to_accessor<PRIV>>
-      get_variadic_tensor_grad(slot_id slot) const {
-    NOT_IMPLEMENTED();
-  }
-
-  size_t get_device_idx() const {
-    NOT_IMPLEMENTED();
-  }
-
 private:
-  Legion::Task const *task;
-  std::vector<Legion::PhysicalRegion> const &regions;
-  Legion::Context ctx;
-  Legion::Runtime *runtime;
-  TaskArgumentsFormat const &args_fmt;
+  // Legion::Task const *task;
+  // std::vector<Legion::PhysicalRegion> const &regions;
+  // Legion::Context ctx;
+  // Legion::Runtime *runtime;
+  // TaskArgumentsFormat const &args_fmt;
   //cow_ptr_t<IMultiDiGraph> const & ITaskArgumentAccesor; 
-  cow_ptr_t<ITaskArgumentAccessor> const & ptr;
+  //cow_ptr_t<ITaskArgumentAccessor> const & ptr;
+  TaskArgumentAccessor(std::shared_ptr<ITaskArgumentAccessor const> & ptr): ptr(ptr) {}
+  std::shared_ptr<ITaskArgumentAccessor const> ptr; 
 };
 
 } // namespace FlexFlow

From f106c9a2af394b63570abd586688c8c01e6d90bd Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Thu, 3 Aug 2023 11:16:14 +0000
Subject: [PATCH 03/13] implement some logic
 LocalTaskArgumentAccessor::get_tensor

---
 lib/runtime/src/ops/aggregate.cc              |  2 ++
 .../src/task_spec/task_argument_accessor.cc   | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/lib/runtime/src/ops/aggregate.cc b/lib/runtime/src/ops/aggregate.cc
index e9874c877a..c1136f513e 100644
--- a/lib/runtime/src/ops/aggregate.cc
+++ b/lib/runtime/src/ops/aggregate.cc
@@ -467,6 +467,8 @@ CostMetrics
 
   fwd_binding.bind_arg(PROFILING, settings);
 
+  fwd_binding.bind_arg(ATTRS, attrs);
+
   auto fwd_accessor = env.get_fwd_accessor(AGGREGATE_FWD_TASK_ID, fwd_binding);
 
   SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc
index 7427347456..c4fb086a52 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.cc
+++ b/lib/runtime/src/task_spec/task_argument_accessor.cc
@@ -14,6 +14,25 @@ template <typename T> T const & LocalTaskArgumentAccessor::get_argument(slot_id
 
 template <Permissions PRIV> privilege_mode_to_accessor<PRIV> LocalTaskArgumentAccessor::get_tensor(slot_id slot) const {
     SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot);
+    if(slot == GATE_PREDS) {
+        InputParallelTensorDesc  gate_preds = get<InputParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
+        //TODO, I should convert gate_pred to privilege_mode_to_accessor<PRIV>
+        NOT_IMPLEMENTED();
+    } else if(slot == GATE_ASSIGN) {
+        InputVariadicParallelTensorDesc  gate_assign = get<InputVariadicParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
+        //TODO, I should convert gate_assign to privilege_mode_to_accessor<PRIV>
+        NOT_IMPLEMENTED();
+    } else if(slot == EXP_PREDS) {
+        InputVariadicParallelTensorDesc exp_preds = get<InputVariadicParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
+        //TODO, I should convert exp_preds to privilege_mode_to_accessor<PRIV>
+        NOT_IMPLEMENTED();
+    } else if(slot == OUTPUT) {
+        ParallelTensorShape  output_shape = get<ParallelTensorShape>(this->sim_task_binding->tensor_shape_bindings.at(slot));
+        //TODO, I should convert output_shape to privilege_mode_to_accessor<PRIV>
+        NOT_IMPLEMENTED();
+    } else {
+        throw std::runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor");
+    }
 //     NOT_IMPLEMENTED();//TODO, I should convert spec to privilege_mode_to_accessor<PRIV>
 //lib/runtime/src/accessor.h
 //     template <>

From 3e578ccf309ca47962c992372615ea19236debe1 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 8 Aug 2023 09:38:19 +0000
Subject: [PATCH 04/13] add arrayshape

---
 lib/kernels/src/array_shape.cc                       | 3 +++
 lib/op-attrs/include/op-attrs/parallel_tensor_dims.h | 2 +-
 lib/runtime/src/task_spec/task_argument_accessor.cc  | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index 44507c14c4..f7f1f19abc 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -10,4 +10,7 @@ std::size_t ArrayShape::get_volume() const {
   return product(this->dims);
 }
 
+ArrayShape::ArrayShape(std::vector<std::size_t> const &_dims)
+    : dims(_dims) {}
+
 } // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
index d38ba75232..fba1202734 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
@@ -25,7 +25,7 @@ struct ParallelTensorDims : public use_visitable_cmp<ParallelTensorDims> {
   ParallelDim &at(ff_dim_t const &);
 
   iterator begin();
-  const_iterator begin() const;
+  const_iterator begin() const; 
   const_iterator cbegin() const;
   iterator end();
   const_iterator end() const;
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc
index c4fb086a52..de2ee3797a 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.cc
+++ b/lib/runtime/src/task_spec/task_argument_accessor.cc
@@ -16,6 +16,8 @@ template <Permissions PRIV> privilege_mode_to_accessor<PRIV> LocalTaskArgumentAc
     SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot);
     if(slot == GATE_PREDS) {
         InputParallelTensorDesc  gate_preds = get<InputParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
+        DataType shape = gate_preds.shape;
+        //use gate_preds.shape to get the ArrayShape
         //TODO, I should convert gate_pred to privilege_mode_to_accessor<PRIV>
         NOT_IMPLEMENTED();
     } else if(slot == GATE_ASSIGN) {

From 6b7d2b4b34029461d2fb84db848e5a65e012c0f7 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 8 Aug 2023 09:44:14 +0000
Subject: [PATCH 05/13] construct ArrayShape in
 LocalTaskArgumentAccessor::get_tensor

---
 lib/op-attrs/include/op-attrs/parallel_tensor_dims.h | 1 +
 lib/op-attrs/src/parallel_tensor_shape.cc            | 8 ++++++++
 lib/runtime/src/task_spec/task_argument_accessor.cc  | 4 ++++
 3 files changed, 13 insertions(+)

diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
index fba1202734..58eae651da 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
@@ -11,6 +11,7 @@ struct ParallelTensorDims : public use_visitable_cmp<ParallelTensorDims> {
 
   size_t get_volume() const;
   size_t num_dims() const;
+  std::vector<size_t> get_dims() const; 
 
   using iterator = typename FFOrdered<ParallelDim>::iterator;
   using const_iterator = typename FFOrdered<ParallelDim>::const_iterator;
diff --git a/lib/op-attrs/src/parallel_tensor_shape.cc b/lib/op-attrs/src/parallel_tensor_shape.cc
index 9a36e7d11b..4556f624f2 100644
--- a/lib/op-attrs/src/parallel_tensor_shape.cc
+++ b/lib/op-attrs/src/parallel_tensor_shape.cc
@@ -16,6 +16,14 @@ static std::vector<ParallelDim> lift_dims(TensorDims const &dims) {
 ParallelTensorDims::ParallelTensorDims(TensorDims const &dims)
     : data(lift_dims(dims)) {}
 
+std::vector<size_t> ParallelTensorDims::get_dims() const {
+  std::vector<size_t> dims;
+  for (ParallelDim const &d : this->data) {
+    dims.push_back(d.size);
+  }
+  return dims;
+}
+
 ParallelTensorShape::ParallelTensorShape(TensorShape const &tensor_shape)
     : dims(tensor_shape.dims), data_type(tensor_shape.data_type) {}
 
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc
index de2ee3797a..794e130a88 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.cc
+++ b/lib/runtime/src/task_spec/task_argument_accessor.cc
@@ -17,6 +17,10 @@ template <Permissions PRIV> privilege_mode_to_accessor<PRIV> LocalTaskArgumentAc
     if(slot == GATE_PREDS) {
         InputParallelTensorDesc  gate_preds = get<InputParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
         DataType shape = gate_preds.shape;
+        ArrayShape = {gate_preds.shape.dims.get_dims()};//gate_preds.shape.dims.get_dims() return std::vector<size_t>
+        //TODO: 1)allocate memory for ptr 2)fill ptr 
+        //question: 1) how much memory should I allocate? 2) how to fill ptr?
+        
         //use gate_preds.shape to get the ArrayShape
         //TODO, I should convert gate_pred to privilege_mode_to_accessor<PRIV>
         NOT_IMPLEMENTED();

From 062a52ab2c7aabbf6905f09273c131be614d9e28 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 8 Aug 2023 12:30:23 +0000
Subject: [PATCH 06/13]  add allocate for LocalTaskArgumentAccessor

---
 lib/kernels/src/allocation.cc                 | 13 +++++++
 lib/kernels/src/cuda/accessor.cc              | 22 +++++++++++
 .../include/op-attrs/parallel_tensor_shape.h  |  3 +-
 lib/op-attrs/src/datatype.cc                  | 23 +++++++++++
 lib/op-attrs/src/parallel_tensor_shape.cc     | 10 +++++
 lib/runtime/src/cuda_allocator.cc             | 17 +++++++++
 lib/runtime/src/cuda_allocator.h              | 19 ++++++++++
 lib/runtime/src/realm_allocator.h             |  3 +-
 .../src/task_spec/task_argument_accessor.cc   | 38 +++++++++++--------
 .../src/task_spec/task_argument_accessor.h    | 12 +++++-
 10 files changed, 138 insertions(+), 22 deletions(-)
 create mode 100644 lib/kernels/src/allocation.cc
 create mode 100644 lib/kernels/src/cuda/accessor.cc
 create mode 100644 lib/op-attrs/src/datatype.cc
 create mode 100644 lib/runtime/src/cuda_allocator.cc
 create mode 100644 lib/runtime/src/cuda_allocator.h

diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc
new file mode 100644
index 0000000000..ee9366f045
--- /dev/null
+++ b/lib/kernels/src/allocation.cc
@@ -0,0 +1,13 @@
+#include "kernels/allocation.h"
+
+namespace FlexFlow {
+
+void * Allocator::allocate(size_t size) {
+  return i_allocator->allocate(size);
+}
+
+void Allocator::deallocate(void *ptr) {
+  i_allocator->deallocate(ptr);
+}
+
+} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/kernels/src/cuda/accessor.cc b/lib/kernels/src/cuda/accessor.cc
new file mode 100644
index 0000000000..4fe9a8b568
--- /dev/null
+++ b/lib/kernels/src/cuda/accessor.cc
@@ -0,0 +1,22 @@
+#include "kernels/accessor.h"
+
+namespace FlexFlow {
+
+int32_t *get_int32_ptr(GenericTensorAccessorW const & w) {
+      return static_cast<int32_t*>(w.ptr.value()); //Note(lambda):we use static_cast, may have some problem
+}
+
+int64_t *get_int64_ptr(GenericTensorAccessorW const &w) {
+    return static_cast<int64_t*>(w.ptr.value()); 
+}
+float *get_float_ptr(GenericTensorAccessorW const & w) {
+    return static_cast<float*>(w.ptr.value()); 
+}
+double *get_double_ptr(GenericTensorAccessorW const &w) {
+    return static_cast<double*>(w.ptr.value()); 
+}
+half *get_half_ptr(GenericTensorAccessorW const & w) {
+    return static_cast<half*>(w.ptr.value()); 
+}
+
+} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
index fd560352bb..eef0034ad5 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
@@ -25,8 +25,7 @@ struct ParallelTensorShape : public use_visitable_cmp<ParallelTensorShape> {
 
   ParallelTensorShape(TensorShape const &);
 
-  int num_dims() const;
-
+  int num_dims() const;  
   ParallelDim const &at(ff_dim_t const &) const;
   ParallelDim &at(ff_dim_t const &);
   ParallelDim const &operator[](ff_dim_t const &) const;
diff --git a/lib/op-attrs/src/datatype.cc b/lib/op-attrs/src/datatype.cc
new file mode 100644
index 0000000000..e92563b628
--- /dev/null
+++ b/lib/op-attrs/src/datatype.cc
@@ -0,0 +1,23 @@
+#include "op-attrs/datatype.h"
+
+namespace FlexFlow {
+
+size_t size_of(DataType data_type);
+      switch (type) {
+    case DataType::BOOL:
+      return sizeof(bool);
+    case DataType::INT32:
+      return sizeof(int32_t);
+    case DataType::INT64:
+      return sizeof(int64_t);
+    case DataType::HALF:
+      return sizeof(float) / 2;
+    case DataType::FLOAT:
+      return sizeof(float);
+    case DataType::DOUBLE:
+      return sizeof(double);
+    default:
+        throw mk_runtime_error("Unknown data type");
+  }
+
+} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/op-attrs/src/parallel_tensor_shape.cc b/lib/op-attrs/src/parallel_tensor_shape.cc
index 4556f624f2..184a695da5 100644
--- a/lib/op-attrs/src/parallel_tensor_shape.cc
+++ b/lib/op-attrs/src/parallel_tensor_shape.cc
@@ -24,6 +24,12 @@ std::vector<size_t> ParallelTensorDims::get_dims() const {
   return dims;
 }
 
+size_t ParallelTensorDims::get_volume() const {
+  return product(transform(this->data, [](ParallelDim const &d) -> size_t {
+    return d.size;
+  }));
+}
+
 ParallelTensorShape::ParallelTensorShape(TensorShape const &tensor_shape)
     : dims(tensor_shape.dims), data_type(tensor_shape.data_type) {}
 
@@ -31,6 +37,10 @@ int get_num_replica_dims(ParallelTensorShape const &shape) {
   return count(shape.dims, is_replica_dim);
 }
 
+TensorShape get_piece_shape(ParallelTensorShape const & parall_tensor_shape) {
+  return TensorShape(parall_tensor_shape.dims, parall_tensor_shape.data_type);
+}
+
 int get_num_replicas(ParallelTensorShape const &shape) {
   return product(
       transform(filter(as_vector(shape.dims), is_replica_dim),
diff --git a/lib/runtime/src/cuda_allocator.cc b/lib/runtime/src/cuda_allocator.cc
new file mode 100644
index 0000000000..41db98926e
--- /dev/null
+++ b/lib/runtime/src/cuda_allocator.cc
@@ -0,0 +1,17 @@
+#include "cuda_allocator.h"
+
+namespace FlexFlow {
+
+void * CudaAllocator::allocate(size_t size) {
+  void *ptr;
+  check_CUDA(cudaMalloc(&ptr, size));
+  return ptr;
+}
+
+void CudaAllocator::deallocate(void *ptr) {
+  check_CUDA(cudaFree(ptr));
+}
+
+} // namespace FlexFlow
+
+
diff --git a/lib/runtime/src/cuda_allocator.h b/lib/runtime/src/cuda_allocator.h
new file mode 100644
index 0000000000..4eea9b4207
--- /dev/null
+++ b/lib/runtime/src/cuda_allocator.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_RUNTIME_CUDA_ALLOCATOR_H
+#define _FLEXFLOW_RUNTIME_CUDA_ALLOCATOR_H
+
+#include "kernels/allocation.h"
+#include <cuda_runtime.h>
+
+namespace FlexFlow {
+
+struct CudaAllocator : public IAllocator {
+  ~CudaAllocator() override;
+
+  void *allocate(size_t) override;
+  void deallocate(void *) override;
+
+};
+
+} // namespace FlexFlow
+
+#endif
\ No newline at end of file
diff --git a/lib/runtime/src/realm_allocator.h b/lib/runtime/src/realm_allocator.h
index 210fd8a050..99ba8a8252 100644
--- a/lib/runtime/src/realm_allocator.h
+++ b/lib/runtime/src/realm_allocator.h
@@ -15,8 +15,7 @@ struct RealmAllocator : public IAllocator {
   ~RealmAllocator() override;
 
   void *allocate(size_t) override;
-  void deallocate(void *) override;
-
+  void deallocate(void *) override
 private:
   Legion::Memory memory;
   stack_vector<Realm::RegionInstance, MAX_INSTANCE_ALLOCATIONS> instances;
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc
index 794e130a88..b777afb637 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.cc
+++ b/lib/runtime/src/task_spec/task_argument_accessor.cc
@@ -12,32 +12,38 @@ template <typename T> T const & LocalTaskArgumentAccessor::get_argument(slot_id
     }
 }
 
+void * LocalTaskArgumentAccessor::allocate(size_t size) {
+        void * ptr = local_allocator.allocate(memory_size);//Note: how(when) to free this memory? 
+        void * cpu_ptr = malloc(memory_size);
+        memset(cpu_ptr, 0, memory_size);
+        checkCUDA(cudaMemcpy(ptr, cpu_ptr, memory_size, cudaMemcpyHostToDevice)); //fill ptr
+        free(cpu_ptr);
+        return ptr;
+}
+
+
 template <Permissions PRIV> privilege_mode_to_accessor<PRIV> LocalTaskArgumentAccessor::get_tensor(slot_id slot) const {
     SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot);
-    if(slot == GATE_PREDS) {
+    if(slot == GATE_PREDS || slot == GATE_ASSIGN ) {
         InputParallelTensorDesc  gate_preds = get<InputParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
-        DataType shape = gate_preds.shape;
-        ArrayShape = {gate_preds.shape.dims.get_dims()};//gate_preds.shape.dims.get_dims() return std::vector<size_t>
-        //TODO: 1)allocate memory for ptr 2)fill ptr 
-        //question: 1) how much memory should I allocate? 2) how to fill ptr?
-        
-        //use gate_preds.shape to get the ArrayShape
-        //TODO, I should convert gate_pred to privilege_mode_to_accessor<PRIV>
-        NOT_IMPLEMENTED();
-    } else if(slot == GATE_ASSIGN) {
-        InputVariadicParallelTensorDesc  gate_assign = get<InputVariadicParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
-        //TODO, I should convert gate_assign to privilege_mode_to_accessor<PRIV>
-        NOT_IMPLEMENTED();
+        DataType data_type = gate_preds.shape.data_type;
+        ArrayShape array_shape = {gate_preds.shape.dims.get_dims()};//gate_preds.shape.dims.get_dims() return std::vector<size_t>
+        size_t shape_size =gate_preds.shape.dims.get_volume()* size_of(shape);
+        void * ptr = allocate(shape_size);
+        return gate_preds_accessor{shape, array_shape, ptr};
     } else if(slot == EXP_PREDS) {
         InputVariadicParallelTensorDesc exp_preds = get<InputVariadicParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
         //TODO, I should convert exp_preds to privilege_mode_to_accessor<PRIV>
         NOT_IMPLEMENTED();
     } else if(slot == OUTPUT) {
         ParallelTensorShape  output_shape = get<ParallelTensorShape>(this->sim_task_binding->tensor_shape_bindings.at(slot));
-        //TODO, I should convert output_shape to privilege_mode_to_accessor<PRIV>
-        NOT_IMPLEMENTED();
+        Datatype data_type = output_shape.data_type;
+        ArrayShape array_shape = {output_shape.dims.get_dims()};//output_shape.dims.get_dims() return std::vector<size_t>
+        size_t shape_size = output_shape.dims.get_volume() * size_of(data_type);
+        void * ptr = allocate(shape_size);
+        return {shape, array_shape, ptr};
     } else {
-        throw std::runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor");
+        throw mk_runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor");
     }
 //     NOT_IMPLEMENTED();//TODO, I should convert spec to privilege_mode_to_accessor<PRIV>
 //lib/runtime/src/accessor.h
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h
index 16e34dca02..0229ccd50b 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.h
+++ b/lib/runtime/src/task_spec/task_argument_accessor.h
@@ -7,6 +7,7 @@
 #include "utils/exception.h"
 #include "utils/stack_map.h"
 #include "utils/strong_typedef.h"
+#include "kernels/allocation.h"
 #include <vector>
 
 namespace FlexFlow {
@@ -164,11 +165,18 @@ struct LocalTaskArgumentAccessor: public ITaskArgumentAccessor{
 virtual template <typename T> T const &get_argument(slot_id slot) const override;
 virtual template <Permissions PRIV> privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const override;
 
-LocalTaskArgumentAccessor(std::shared_ptr<SimTaskBinding const>  & sim_task_binding):sim_task_binding(sim_task_binding){}
+LocalTaskArgumentAccessor(std::shared_ptr<SimTaskBinding const>  & sim_task_binding):sim_task_binding(sim_task_binding){
+  local_allocator = Allocator::create<CudaAllocator>();
+}
+
+void * allocate(size_t size);
+
+void deallocate(void * ptr);
 
 private:
   std::shared_ptr<SimTaskBinding const> sim_task_binding;
-
+ // CudaAllocator local_allocator;
+ Allocator local_allocator;
 };
 
 struct TaskArgumentAccessor {

From 7ef9afc38de8081bd77ada0fd20824d6995d577c Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 9 Aug 2023 01:56:52 +0000
Subject: [PATCH 07/13] add get_variadic_tensor  method for
 ITaskArgumentAccessor

---
 lib/CMakeLists.txt                            |   4 +-
 lib/kernels/src/allocation.cc                 |   2 +-
 lib/kernels/src/array_shape.cc                |   3 +-
 lib/kernels/src/cuda/accessor.cc              |  17 +-
 .../include/op-attrs/parallel_tensor_dims.h   |   4 +-
 .../include/op-attrs/parallel_tensor_shape.h  |   2 +-
 lib/op-attrs/include/op-attrs/tensor_shape.h  |   2 +-
 lib/op-attrs/src/datatype.cc                  |   7 +-
 lib/op-attrs/src/parallel_tensor_shape.cc     |  12 +-
 lib/runtime/src/cuda_allocator.cc             |   4 +-
 lib/runtime/src/cuda_allocator.h              |   1 -
 lib/runtime/src/realm_allocator.h             |   4 +-
 lib/runtime/src/sim_environment.cc            |  26 +-
 lib/runtime/src/sim_environment.h             |   9 +-
 .../src/task_spec/task_argument_accessor.cc   | 126 ++++-----
 .../src/task_spec/task_argument_accessor.h    | 239 ++++--------------
 16 files changed, 167 insertions(+), 295 deletions(-)

diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index e137f0dfc3..ece639cb6a 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -1,8 +1,8 @@
 #add_subdirectory(pcg)
 #add_subdirectory(compiler)
 add_subdirectory(runtime)
-#add_subdirectory(op-attrs)
-#add_subdirectory(kernels)
+add_subdirectory(op-attrs)
+add_subdirectory(kernels)
 add_subdirectory(utils)
 # add_subdirectory(ffi)
 #add_subdirectory(substitutions)
diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc
index ee9366f045..eb90b0cb69 100644
--- a/lib/kernels/src/allocation.cc
+++ b/lib/kernels/src/allocation.cc
@@ -2,7 +2,7 @@
 
 namespace FlexFlow {
 
-void * Allocator::allocate(size_t size) {
+void *Allocator::allocate(size_t size) {
   return i_allocator->allocate(size);
 }
 
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index f7f1f19abc..67c3de54dd 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -10,7 +10,6 @@ std::size_t ArrayShape::get_volume() const {
   return product(this->dims);
 }
 
-ArrayShape::ArrayShape(std::vector<std::size_t> const &_dims)
-    : dims(_dims) {}
+ArrayShape::ArrayShape(std::vector<std::size_t> const &_dims) : dims(_dims) {}
 
 } // namespace FlexFlow
diff --git a/lib/kernels/src/cuda/accessor.cc b/lib/kernels/src/cuda/accessor.cc
index 4fe9a8b568..1559c10350 100644
--- a/lib/kernels/src/cuda/accessor.cc
+++ b/lib/kernels/src/cuda/accessor.cc
@@ -2,21 +2,22 @@
 
 namespace FlexFlow {
 
-int32_t *get_int32_ptr(GenericTensorAccessorW const & w) {
-      return static_cast<int32_t*>(w.ptr.value()); //Note(lambda):we use static_cast, may have some problem
+int32_t *get_int32_ptr(GenericTensorAccessorW const &w) {
+  return static_cast<int32_t *>(
+      w.ptr.value()); // Note(lambda):we use static_cast, may have some problem
 }
 
 int64_t *get_int64_ptr(GenericTensorAccessorW const &w) {
-    return static_cast<int64_t*>(w.ptr.value()); 
+  return static_cast<int64_t *>(w.ptr.value());
 }
-float *get_float_ptr(GenericTensorAccessorW const & w) {
-    return static_cast<float*>(w.ptr.value()); 
+float *get_float_ptr(GenericTensorAccessorW const &w) {
+  return static_cast<float *>(w.ptr.value());
 }
 double *get_double_ptr(GenericTensorAccessorW const &w) {
-    return static_cast<double*>(w.ptr.value()); 
+  return static_cast<double *>(w.ptr.value());
 }
-half *get_half_ptr(GenericTensorAccessorW const & w) {
-    return static_cast<half*>(w.ptr.value()); 
+half *get_half_ptr(GenericTensorAccessorW const &w) {
+  return static_cast<half *>(w.ptr.value());
 }
 
 } // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
index 58eae651da..aeec0c6e01 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
@@ -11,7 +11,7 @@ struct ParallelTensorDims : public use_visitable_cmp<ParallelTensorDims> {
 
   size_t get_volume() const;
   size_t num_dims() const;
-  std::vector<size_t> get_dims() const; 
+  std::vector<size_t> get_dims() const;
 
   using iterator = typename FFOrdered<ParallelDim>::iterator;
   using const_iterator = typename FFOrdered<ParallelDim>::const_iterator;
@@ -26,7 +26,7 @@ struct ParallelTensorDims : public use_visitable_cmp<ParallelTensorDims> {
   ParallelDim &at(ff_dim_t const &);
 
   iterator begin();
-  const_iterator begin() const; 
+  const_iterator begin() const;
   const_iterator cbegin() const;
   iterator end();
   const_iterator end() const;
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
index eef0034ad5..182b2169c3 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
@@ -25,7 +25,7 @@ struct ParallelTensorShape : public use_visitable_cmp<ParallelTensorShape> {
 
   ParallelTensorShape(TensorShape const &);
 
-  int num_dims() const;  
+  int num_dims() const;
   ParallelDim const &at(ff_dim_t const &) const;
   ParallelDim &at(ff_dim_t const &);
   ParallelDim const &operator[](ff_dim_t const &) const;
diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h
index fa34860817..ab0f958b42 100644
--- a/lib/op-attrs/include/op-attrs/tensor_shape.h
+++ b/lib/op-attrs/include/op-attrs/tensor_shape.h
@@ -16,7 +16,7 @@ struct TensorShape : public use_visitable_cmp<TensorShape> {
 
   template <typename Dims>
   TensorShape(Dims const &dims, DataType data_type)
-      : dims(dims), data_type(data_type) {}
+      : dims(this->dims), data_type(this->data_type) {}
 
   size_t at(ff_dim_t) const;
   size_t operator[](ff_dim_t) const;
diff --git a/lib/op-attrs/src/datatype.cc b/lib/op-attrs/src/datatype.cc
index e92563b628..fc47ef47eb 100644
--- a/lib/op-attrs/src/datatype.cc
+++ b/lib/op-attrs/src/datatype.cc
@@ -2,8 +2,8 @@
 
 namespace FlexFlow {
 
-size_t size_of(DataType data_type);
-      switch (type) {
+size_t size_of(DataType data_type) {
+  switch (data_type) {
     case DataType::BOOL:
       return sizeof(bool);
     case DataType::INT32:
@@ -17,7 +17,8 @@ size_t size_of(DataType data_type);
     case DataType::DOUBLE:
       return sizeof(double);
     default:
-        throw mk_runtime_error("Unknown data type");
+      throw mk_runtime_error("Unknown data type");
   }
+}
 
 } // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/op-attrs/src/parallel_tensor_shape.cc b/lib/op-attrs/src/parallel_tensor_shape.cc
index 184a695da5..0f4121973f 100644
--- a/lib/op-attrs/src/parallel_tensor_shape.cc
+++ b/lib/op-attrs/src/parallel_tensor_shape.cc
@@ -25,9 +25,13 @@ std::vector<size_t> ParallelTensorDims::get_dims() const {
 }
 
 size_t ParallelTensorDims::get_volume() const {
-  return product(transform(this->data, [](ParallelDim const &d) -> size_t {
-    return d.size;
-  }));
+
+  // this function can use contains.h to optimize the code
+  size_t volume = 1;
+  for (ParallelDim const &d : this->data) {
+    volume *= d.size;
+  }
+  return volume;
 }
 
 ParallelTensorShape::ParallelTensorShape(TensorShape const &tensor_shape)
@@ -37,7 +41,7 @@ int get_num_replica_dims(ParallelTensorShape const &shape) {
   return count(shape.dims, is_replica_dim);
 }
 
-TensorShape get_piece_shape(ParallelTensorShape const & parall_tensor_shape) {
+TensorShape get_piece_shape(ParallelTensorShape const &parall_tensor_shape) {
   return TensorShape(parall_tensor_shape.dims, parall_tensor_shape.data_type);
 }
 
diff --git a/lib/runtime/src/cuda_allocator.cc b/lib/runtime/src/cuda_allocator.cc
index 41db98926e..5a4529c8ed 100644
--- a/lib/runtime/src/cuda_allocator.cc
+++ b/lib/runtime/src/cuda_allocator.cc
@@ -2,7 +2,7 @@
 
 namespace FlexFlow {
 
-void * CudaAllocator::allocate(size_t size) {
+void *CudaAllocator::allocate(size_t size) {
   void *ptr;
   check_CUDA(cudaMalloc(&ptr, size));
   return ptr;
@@ -13,5 +13,3 @@ void CudaAllocator::deallocate(void *ptr) {
 }
 
 } // namespace FlexFlow
-
-
diff --git a/lib/runtime/src/cuda_allocator.h b/lib/runtime/src/cuda_allocator.h
index 4eea9b4207..cb3c8e1c19 100644
--- a/lib/runtime/src/cuda_allocator.h
+++ b/lib/runtime/src/cuda_allocator.h
@@ -11,7 +11,6 @@ struct CudaAllocator : public IAllocator {
 
   void *allocate(size_t) override;
   void deallocate(void *) override;
-
 };
 
 } // namespace FlexFlow
diff --git a/lib/runtime/src/realm_allocator.h b/lib/runtime/src/realm_allocator.h
index 99ba8a8252..95957b643d 100644
--- a/lib/runtime/src/realm_allocator.h
+++ b/lib/runtime/src/realm_allocator.h
@@ -15,9 +15,7 @@ struct RealmAllocator : public IAllocator {
   ~RealmAllocator() override;
 
   void *allocate(size_t) override;
-  void deallocate(void *) override
-private:
-  Legion::Memory memory;
+  void deallocate(void *) override private : Legion::Memory memory;
   stack_vector<Realm::RegionInstance, MAX_INSTANCE_ALLOCATIONS> instances;
 };
 
diff --git a/lib/runtime/src/sim_environment.cc b/lib/runtime/src/sim_environment.cc
index a3a14a0b64..8233735396 100644
--- a/lib/runtime/src/sim_environment.cc
+++ b/lib/runtime/src/sim_environment.cc
@@ -2,25 +2,25 @@
 
 namespaec FlexFlow {
 
-void SimTaskBinding::bind(slot_id id, ParallelTensorShape const & shape) {
+  void SimTaskBinding::bind(slot_id id, ParallelTensorShape const &shape) {
     tensor_shape_bindings.insert(id, shape);
-}
-void SimTaskBinding::bind(slot_id id, TensorShape const & shape) {
+  }
+  void SimTaskBinding::bind(slot_id id, TensorShape const &shape) {
     tensor_shape_bindings.insert(id, shape);
-}
+  }
 
-void SimTaskBinding::bind(slot_id  id, InputVariadicParallelTensorDesc const & desc) {
+  void SimTaskBinding::bind(slot_id id,
+                            InputVariadicParallelTensorDesc const &desc) {
     this->tensor_shape_bindings.insert(id, desc);
-}
+  }
 
-void SimTaskBinding::bind_arg(slot_id id, SimArg const & arg) {
+  void SimTaskBinding::bind_arg(slot_id id, SimArg const &arg) {
     arg_bindings.insert(id, arg);
-}
-
-TaskArgumentAccessor SimEnvironment::get_fwd_accessor(task_id_t tid, SimTaskBinding const & sim_task_binding) {
-    NOT_IMPLEMENTED();//TODO
-}
-
+  }
 
+  TaskArgumentAccessor SimEnvironment::get_fwd_accessor(
+      task_id_t tid, SimTaskBinding const &sim_task_binding) {
+    NOT_IMPLEMENTED(); // TODO
+  }
 
 } // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/runtime/src/sim_environment.h b/lib/runtime/src/sim_environment.h
index 7af7a84125..57b00eda34 100644
--- a/lib/runtime/src/sim_environment.h
+++ b/lib/runtime/src/sim_environment.h
@@ -26,7 +26,9 @@ struct InputVariadicParallelTensorDesc {
 };
 
 using SimArg = variant<ProfilingSettings, AggregateAttrs>;
-using SimTensorSpec = variant<ParallelTensorShape, InputParallelTensorDesc, InputVariadicParallelTensorDesc>;
+using SimTensorSpec = variant<ParallelTensorShape,
+                              InputParallelTensorDesc,
+                              InputVariadicParallelTensorDesc>;
 
 struct SimTaskBinding {
   void bind(slot_id, ParallelTensorShape const &);
@@ -38,8 +40,9 @@ struct SimTaskBinding {
   void bind_untrainable(slot_id, std::vector<ParallelTensorShape> const &);
   void bind(slot_id, std::vector<ParallelTensorShape> const &, IsTrainable);
   void bind(slot_id, InputVariadicParallelTensorDesc const &);
-  
-  template <typename T> void bind_arg(slot_id id, T const & name);
+
+  template <typename T>
+  void bind_arg(slot_id id, T const &name);
 
   std::unordered_map<slot_id, SimArg> arg_bindings;
   std::unordered_map<slot_id, SimTensorSpec> tensor_shape_bindings;
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc
index b777afb637..55d66b035a 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.cc
+++ b/lib/runtime/src/task_spec/task_argument_accessor.cc
@@ -2,69 +2,77 @@
 
 using namespace FlexFlow {
 
-template <typename T> T const & LocalTaskArgumentAccessor::get_argument(slot_id slot)  const{
-    if(slot == PROFILING) {
-        return get<ProfilingSettings>(this->arg_bindings.at(slot));
-    } elif (slot == ATTRS) {
-        return get<AggregateAttrs>(this->arg_bindings.at(slot));
-    } else {
-        throw std::runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_argument");
+  template <typename T>
+  T const &LocalTaskArgumentAccessor::get_argument(slot_id slot) const {
+    if (slot == PROFILING) {
+      return get<ProfilingSettings>(this->arg_bindings.at(slot));
     }
-}
-
-void * LocalTaskArgumentAccessor::allocate(size_t size) {
-        void * ptr = local_allocator.allocate(memory_size);//Note: how(when) to free this memory? 
-        void * cpu_ptr = malloc(memory_size);
-        memset(cpu_ptr, 0, memory_size);
-        checkCUDA(cudaMemcpy(ptr, cpu_ptr, memory_size, cudaMemcpyHostToDevice)); //fill ptr
-        free(cpu_ptr);
-        return ptr;
-}
+    elif (slot == ATTRS) {
+      return get<AggregateAttrs>(this->arg_bindings.at(slot));
+    }
+    else {
+      throw std::runtime_error(
+          "Unknown Slot ID in LocalTaskArgumentAccessor::get_argument");
+    }
+  }
 
+  void *LocalTaskArgumentAccessor::allocate(size_t size) {
+    void *ptr = local_allocator.allocate(
+        memory_size); // Note: how(when) to free this memory?
+    void *cpu_ptr = malloc(memory_size);
+    memset(cpu_ptr, 0, memory_size);
+    checkCUDA(cudaMemcpy(
+        ptr, cpu_ptr, memory_size, cudaMemcpyHostToDevice)); // fill ptr
+    free(cpu_ptr);
+    return ptr;
+  }
 
-template <Permissions PRIV> privilege_mode_to_accessor<PRIV> LocalTaskArgumentAccessor::get_tensor(slot_id slot) const {
-    SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot);
-    if(slot == GATE_PREDS || slot == GATE_ASSIGN ) {
-        InputParallelTensorDesc  gate_preds = get<InputParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
-        DataType data_type = gate_preds.shape.data_type;
-        ArrayShape array_shape = {gate_preds.shape.dims.get_dims()};//gate_preds.shape.dims.get_dims() return std::vector<size_t>
-        size_t shape_size =gate_preds.shape.dims.get_volume()* size_of(shape);
-        void * ptr = allocate(shape_size);
-        return gate_preds_accessor{shape, array_shape, ptr};
-    } else if(slot == EXP_PREDS) {
-        InputVariadicParallelTensorDesc exp_preds = get<InputVariadicParallelTensorDesc>(this->sim_task_binding->tensor_shape_bindings.at(slot));
-        //TODO, I should convert exp_preds to privilege_mode_to_accessor<PRIV>
-        NOT_IMPLEMENTED();
-    } else if(slot == OUTPUT) {
-        ParallelTensorShape  output_shape = get<ParallelTensorShape>(this->sim_task_binding->tensor_shape_bindings.at(slot));
-        Datatype data_type = output_shape.data_type;
-        ArrayShape array_shape = {output_shape.dims.get_dims()};//output_shape.dims.get_dims() return std::vector<size_t>
-        size_t shape_size = output_shape.dims.get_volume() * size_of(data_type);
-        void * ptr = allocate(shape_size);
-        return {shape, array_shape, ptr};
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> LocalTaskArgumentAccessor::get_tensor(
+      slot_id slot) const {
+    SimTensorSpec const &spec = this->tensor_shape_bindings.at(slot);
+    if (slot == GATE_PREDS || slot == GATE_ASSIGN) {
+      InputParallelTensorDesc gate_preds = get<InputParallelTensorDesc>(
+          this->sim_task_binding->tensor_shape_bindings.at(slot));
+      DataType data_type = gate_preds.shape.data_type;
+      ArrayShape array_shape = {
+          gate_preds.shape.dims.get_dims()}; // gate_preds.shape.dims.get_dims()
+                                             // return std::vector<size_t>
+      size_t shape_size = gate_preds.shape.dims.get_volume() * size_of(shape);
+      void *ptr = allocate(shape_size);
+      return gate_preds_accessor{shape, array_shape, ptr};
+    } else if (slot == OUTPUT) {
+      ParallelTensorShape output_shape = get<ParallelTensorShape>(
+          this->sim_task_binding->tensor_shape_bindings.at(slot));
+      Datatype data_type = output_shape.data_type;
+      ArrayShape array_shape = {
+          output_shape.dims.get_dims()}; // output_shape.dims.get_dims() return
+                                         // std::vector<size_t>
+      size_t shape_size = output_shape.dims.get_volume() * size_of(data_type);
+      void *ptr = allocate(shape_size);
+      return {shape, array_shape, ptr};
     } else {
-        throw mk_runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor");
+      throw mk_runtime_error(
+          "Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor");
     }
-//     NOT_IMPLEMENTED();//TODO, I should convert spec to privilege_mode_to_accessor<PRIV>
-//lib/runtime/src/accessor.h
-//     template <>
-// struct privilege_mode_to_accessor_t<Permissions::RW> {
-//   using type = GenericTensorAccessorW;
-// };
+  }
 
-// template <>
-// struct privilege_mode_to_accessor_t<Permissions::RO> {
-//   using type = GenericTensorAccessorR;
-// };
-
-// template <>
-// struct privilege_mode_to_accessor_t<Permissions::WO> {
-//   using type = GenericTensorAccessorW;
-// };
-
-// template <Permissions PRIV>
-// using privilege_mode_to_accessor =
-//     typename privilege_mode_to_accessor_t<PRIV>::type;
-}
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>> get_variadic_tensor(
+      slot_id slot) const override {
+    std::vector<privilege_mode_to_accessor<PRIV>> result;
+    InputVariadicParallelTensorDesc const &spec =
+        get<InputVariadicParallelTensorDesc>(
+            this->sim_task_binding->tensor_shape_bindings.at(slot));
+    for (auto const &shape : spec.shapes) {
+      ArrayShape array_shape = {
+          shape.dims
+              .get_dims()}; // shape.dims.get_dims() return std::vector<size_t>
+      size_t shape_size = shape.dims.get_volume() * size_of(shape.data_type);
+      void *ptr = allocate(shape_size);
+      result.push_back({shape, array_shape, ptr});
+    }
+    return result;
+  }
 
-}//namespace FlexFlow
\ No newline at end of file
+} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h
index 0229ccd50b..d640fc04a9 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.h
+++ b/lib/runtime/src/task_spec/task_argument_accessor.h
@@ -2,12 +2,12 @@
 #define _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H
 
 #include "accessor.h"
+#include "kernels/allocation.h"
 #include "runtime/config.h"
 #include "task_invocation.h"
 #include "utils/exception.h"
 #include "utils/stack_map.h"
 #include "utils/strong_typedef.h"
-#include "kernels/allocation.h"
 #include <vector>
 
 namespace FlexFlow {
@@ -71,85 +71,33 @@ region_idx_t get_region_idx(TaskArgumentsFormat const &,
                             parallel_tensor_guid_t const &);
 DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &);
 
-
 struct ITaskArgumentAccessor {
-virtual template <typename T> T const &get_argument(slot_id slot) const = 0;
+  virtual template <typename T>
+  T const &get_argument(slot_id slot) const = 0;
 
-
-virtual template <Permissions PRIV>
+  virtual template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const = 0;
-};
 
-struct LegionTaskArgumentAccessor: public ITaskArgumentAccessor{
+  virtual template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_tensor(slot_id slot) const = 0;
+};
 
+struct LegionTaskArgumentAccessor : public ITaskArgumentAccessor {
 public:
-virtual template <typename T> T const &get_argument(slot_id slot) const override;
-virtual template <Permissions PRIV> privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const override;
-
-//   template <typename T>
-//   T const &get_argument(slot_id slot) const {
-//     TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot);
-//     std::type_index actual_type = arg_fmt.type;
-//     std::type_index requested_type = {typeid(T)};
-
-//     if (actual_type != requested_type) {
-//       throw mk_runtime_error(
-//           "Type mismatch in argument access (\"{}\" != \"{}\")",
-//           actual_type.name(),
-//           requested_type.name());
-//     }
-
-//     void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start];
-//     Legion::Deserializer dez(start_ptr, arg_fmt.size());
-
-//     return ff_task_deserialize<T>(dez);
-//   }
-
-LegionTaskArgumentAccessor(Legion::Task const *task,
-                       std::vector<Legion::PhysicalRegion> const &regions,
-                       Legion::Context ctx,
-                       Legion::Runtime *runtime):task(task), regions(regions), ctx(ctx), runtime(runtime){}
-
-// template <Permissions PRIV> privilege_mode_to_accessor<PRIV>
-//       get_generic_accessor(region_idx_t const &idx) const {
-//     auto tensor_privs = get_permissions(this->args_fmt, idx);
-//     if (tensor_privs != PRIV) {
-//       throw mk_runtime_error(
-//           "Privilege mismatch while accessing tensor: {} != {}",
-//           tensor_privs,
-//           PRIV);
-//     }
-
-//     return helperGetGenericTensorAccessor<PRIV>(
-//         get_datatype(this->args_fmt, idx),
-//         regions[idx.value()],
-//         task->regions[idx.value()],
-//         FID_DATA,
-//         ctx,
-//         runtime);
-//   }
-
-//     template <Permissions PRIV>
-//   privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const {
-//     auto argument_format =
-//         get<NonvariadicFormat>(this->args_fmt.region_idxs.at(slot));
-
-//     return this->get_generic_accessor<PRIV>(argument_format);
-//   }
-
-//   template <Permissions PRIV>
-//   std::vector<privilege_mode_to_accessor<PRIV>>
-//       get_variadic_tensor(slot_id slot) const {
-//     std::vector<privilege_mode_to_accessor<PRIV>> result;
-
-//     auto argument_format =
-//         get<VariadicFormat>(this->args_fmt.region_idxs.at(slot));
-//     for (NonvariadicFormat const &argument : argument_format) {
-//       result.push_back(this->get_generic_accessor<PRIV>(argument));
-//     }
-
-//     return result;
-//   }
+  template <typename T>
+  T const &get_argument(slot_id slot) const override;
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const override;
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_tensor(slot_id slot) const override;
+
+  LegionTaskArgumentAccessor(Legion::Task const *task,
+                             std::vector<Legion::PhysicalRegion> const &regions,
+                             Legion::Context ctx,
+                             Legion::Runtime *runtime)
+      : task(task), regions(regions), ctx(ctx), runtime(runtime) {}
 
 private:
   Legion::Task const *task;
@@ -159,129 +107,47 @@ LegionTaskArgumentAccessor(Legion::Task const *task,
   TaskArgumentsFormat const &args_fmt;
 };
 
-struct LocalTaskArgumentAccessor: public ITaskArgumentAccessor{
-
+struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor {
 public:
-virtual template <typename T> T const &get_argument(slot_id slot) const override;
-virtual template <Permissions PRIV> privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const override;
-
-LocalTaskArgumentAccessor(std::shared_ptr<SimTaskBinding const>  & sim_task_binding):sim_task_binding(sim_task_binding){
-  local_allocator = Allocator::create<CudaAllocator>();
-}
-
-void * allocate(size_t size);
+  template <typename T>
+  T const &get_argument(slot_id slot) const override;
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const override;
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_tensor(slot_id slot) const override;
+
+  LocalTaskArgumentAccessor(
+      std::shared_ptr<SimTaskBinding const> &sim_task_binding)
+      : sim_task_binding(sim_task_binding) {
+    local_allocator = Allocator::create<CudaAllocator>();
+  }
 
-void deallocate(void * ptr);
+  void *allocate(size_t size);
+  void deallocate(void *ptr);
 
 private:
   std::shared_ptr<SimTaskBinding const> sim_task_binding;
- // CudaAllocator local_allocator;
- Allocator local_allocator;
+  Allocator local_allocator;
 };
 
 struct TaskArgumentAccessor {
-  // TaskArgumentAccessor(Legion::Task const *task,
-  //                      std::vector<Legion::PhysicalRegion> const &regions,
-  //                      Legion::Context ctx,
-  //                      Legion::Runtime *runtime);
-    template <typename T>
+  template <typename T>
   T const &get_argument(slot_id slot) const {
     return this->ptr->get_argument<T>(slot);
   }
 
-
-    template <Permissions PRIV>
+  template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const {
     return this->ptr->get_tensor<PRIV>(slot);
   }
 
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_tensor(slot_id slot) const {
+    return this->ptr->get_variadic_tensor<PRIV>(slot);
+  }
 
-
-  // template <typename T>
-  // T const &get_argument(slot_id slot) const {
-  //   TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot);
-  //   std::type_index actual_type = arg_fmt.type;
-  //   std::type_index requested_type = {typeid(T)};
-
-  //   if (actual_type != requested_type) {
-  //     throw mk_runtime_error(
-  //         "Type mismatch in argument access (\"{}\" != \"{}\")",
-  //         actual_type.name(),
-  //         requested_type.name());
-  //   }
-
-  //   void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start];
-  //   Legion::Deserializer dez(start_ptr, arg_fmt.size());
-
-  //   return ff_task_deserialize<T>(dez);
-  // }
-
-  // template <typename T>
-  // optional<T> get_optional_argument(slot_id) const {
-  //   NOT_IMPLEMENTED();
-  // }
-
-  // template <typename T>
-  // std::vector<T> get_variadic_argument(slot_id) const {
-  //   NOT_IMPLEMENTED();
-  // }
-
-  // template <Permissions PRIV>
-  // privilege_mode_to_accessor<PRIV>
-  //     get_generic_accessor(region_idx_t const &idx) const {
-  //   auto tensor_privs = get_permissions(this->args_fmt, idx);
-  //   if (tensor_privs != PRIV) {
-  //     throw mk_runtime_error(
-  //         "Privilege mismatch while accessing tensor: {} != {}",
-  //         tensor_privs,
-  //         PRIV);
-  //   }
-
-  //   return helperGetGenericTensorAccessor<PRIV>(
-  //       get_datatype(this->args_fmt, idx),
-  //       regions[idx.value()],
-  //       task->regions[idx.value()],
-  //       FID_DATA,
-  //       ctx,
-  //       runtime);
-  // }
-
-  // template <Permissions PRIV>
-  // privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const {
-  //   auto argument_format =
-  //       get<NonvariadicFormat>(this->args_fmt.region_idxs.at(slot));
-
-  //   return this->get_generic_accessor<PRIV>(argument_format);
-  // }
-
-  // template <Permissions PRIV>
-  // privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id slot) const {
-  //   NOT_IMPLEMENTED();
-  // }
-
-  // template <Permissions PRIV>
-  // std::vector<privilege_mode_to_accessor<PRIV>>
-  //     get_variadic_tensor(slot_id slot) const {
-  //   std::vector<privilege_mode_to_accessor<PRIV>> result;
-
-  //   auto argument_format =
-  //       get<VariadicFormat>(this->args_fmt.region_idxs.at(slot));
-  //   for (NonvariadicFormat const &argument : argument_format) {
-  //     result.push_back(this->get_generic_accessor<PRIV>(argument));
-  //   }
-
-  //   return result;
-  // }
-
-  // template <Permissions PRIV>
-  // std::vector<privilege_mode_to_accessor<PRIV>>
-  //     get_variadic_tensor_grad(slot_id slot) const {
-  //   NOT_IMPLEMENTED();
-  // }
-
-  // size_t get_device_idx() const {
-  //   NOT_IMPLEMENTED();
-  // }
   template <typename T, typename... Args>
   static
       typename std::enable_if<std::is_base_of<ITaskArgumentAccessor, T>::value,
@@ -290,16 +156,11 @@ struct TaskArgumentAccessor {
     return TaskArgumentAccessor(
         std::make_shared<T>(std::forward<Args>(args)...));
   }
+
 private:
-  // Legion::Task const *task;
-  // std::vector<Legion::PhysicalRegion> const &regions;
-  // Legion::Context ctx;
-  // Legion::Runtime *runtime;
-  // TaskArgumentsFormat const &args_fmt;
-  //cow_ptr_t<IMultiDiGraph> const & ITaskArgumentAccesor; 
-  //cow_ptr_t<ITaskArgumentAccessor> const & ptr;
-  TaskArgumentAccessor(std::shared_ptr<ITaskArgumentAccessor const> & ptr): ptr(ptr) {}
-  std::shared_ptr<ITaskArgumentAccessor const> ptr; 
+  TaskArgumentAccessor(std::shared_ptr<ITaskArgumentAccessor const> &ptr)
+      : ptr(ptr) {}
+  std::shared_ptr<ITaskArgumentAccessor const> ptr;
 };
 
 } // namespace FlexFlow

From 07ef870e670bd6cd3db0528926867ccfa85c6553 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 9 Aug 2023 01:59:28 +0000
Subject: [PATCH 08/13] delete the kernels/src/cuda/accessor.cc

---
 lib/kernels/src/cuda/accessor.cc | 23 -----------------------
 1 file changed, 23 deletions(-)
 delete mode 100644 lib/kernels/src/cuda/accessor.cc

diff --git a/lib/kernels/src/cuda/accessor.cc b/lib/kernels/src/cuda/accessor.cc
deleted file mode 100644
index 1559c10350..0000000000
--- a/lib/kernels/src/cuda/accessor.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "kernels/accessor.h"
-
-namespace FlexFlow {
-
-int32_t *get_int32_ptr(GenericTensorAccessorW const &w) {
-  return static_cast<int32_t *>(
-      w.ptr.value()); // Note(lambda):we use static_cast, may have some problem
-}
-
-int64_t *get_int64_ptr(GenericTensorAccessorW const &w) {
-  return static_cast<int64_t *>(w.ptr.value());
-}
-float *get_float_ptr(GenericTensorAccessorW const &w) {
-  return static_cast<float *>(w.ptr.value());
-}
-double *get_double_ptr(GenericTensorAccessorW const &w) {
-  return static_cast<double *>(w.ptr.value());
-}
-half *get_half_ptr(GenericTensorAccessorW const &w) {
-  return static_cast<half *>(w.ptr.value());
-}
-
-} // namespace FlexFlow
\ No newline at end of file

From 445863d0f9875475c94fd0fd6aa791b4f7c73d6a Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 9 Aug 2023 02:07:06 +0000
Subject: [PATCH 09/13] fix the cuda_allocator error

---
 lib/runtime/src/cuda_allocator.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/runtime/src/cuda_allocator.cc b/lib/runtime/src/cuda_allocator.cc
index 5a4529c8ed..f60bf513ee 100644
--- a/lib/runtime/src/cuda_allocator.cc
+++ b/lib/runtime/src/cuda_allocator.cc
@@ -1,15 +1,16 @@
 #include "cuda_allocator.h"
+#include "kernels/device.h"
 
 namespace FlexFlow {
 
 void *CudaAllocator::allocate(size_t size) {
   void *ptr;
-  check_CUDA(cudaMalloc(&ptr, size));
+  checkCUDA(cudaMalloc(&ptr, size));
   return ptr;
 }
 
 void CudaAllocator::deallocate(void *ptr) {
-  check_CUDA(cudaFree(ptr));
+  checkCUDA(cudaFree(ptr));
 }
 
 } // namespace FlexFlow

From eaf889d29b951ccfedb47b28d33a588fe3b199dd Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 9 Aug 2023 07:10:56 +0000
Subject: [PATCH 10/13] add API interface for ITaskArgumentAccessor and its
 subclass

---
 .../src/task_spec/task_argument_accessor.h    | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h
index d640fc04a9..448a92d94b 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.h
+++ b/lib/runtime/src/task_spec/task_argument_accessor.h
@@ -81,18 +81,54 @@ struct ITaskArgumentAccessor {
   virtual template <Permissions PRIV>
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor(slot_id slot) const = 0;
+
+  virtual template <typename T>
+  optional<T> get_optional_argument(slot_id) const = 0;
+
+  virtual template <typename T>
+  std::vector<T> get_variadic_argument(slot_id) const = 0;
+
+  virtual template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV>
+      get_generic_accessor(region_idx_t const &idx) const = 0;
+
+  virtual template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id slot) const = 0;
+
+  virtual template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_tensor_grad(slot_id slot) const = 0;
+
+  virtual size_t get_device_idx() const = 0;
 };
 
 struct LegionTaskArgumentAccessor : public ITaskArgumentAccessor {
 public:
   template <typename T>
   T const &get_argument(slot_id slot) const override;
+
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const override;
+
   template <Permissions PRIV>
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor(slot_id slot) const override;
 
+  template <typename T>
+  optional<T> get_optional_argument(slot_id) const override;
+
+  template <typename T>
+  std::vector<T> get_variadic_argument(slot_id) const override;
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id slot) const override;
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_tensor_grad(slot_id slot) const override;
+
+  size_t get_device_idx() const override;
+
   LegionTaskArgumentAccessor(Legion::Task const *task,
                              std::vector<Legion::PhysicalRegion> const &regions,
                              Legion::Context ctx,
@@ -111,12 +147,29 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor {
 public:
   template <typename T>
   T const &get_argument(slot_id slot) const override;
+
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(slot_id slot) const override;
+
   template <Permissions PRIV>
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor(slot_id slot) const override;
 
+  template <typename T>
+  optional<T> get_optional_argument(slot_id) const override;
+
+  template <typename T>
+  std::vector<T> get_variadic_argument(slot_id) const override;
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id slot) const override;
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_tensor_grad(slot_id slot) const override;
+
+  size_t get_device_idx() const override;
+
   LocalTaskArgumentAccessor(
       std::shared_ptr<SimTaskBinding const> &sim_task_binding)
       : sim_task_binding(sim_task_binding) {

From e8a80c636d59a928f955f2c99f7590c3393ddc8d Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Thu, 10 Aug 2023 09:11:37 +0000
Subject: [PATCH 11/13] add memory usage track for LocalTaskArgumentAccessor

---
 .../src/task_spec/task_argument_accessor.cc   | 20 ++++++++++---------
 .../src/task_spec/task_argument_accessor.h    |  7 ++++++-
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc
index 55d66b035a..c74685d662 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.cc
+++ b/lib/runtime/src/task_spec/task_argument_accessor.cc
@@ -17,12 +17,13 @@ using namespace FlexFlow {
   }
 
   void *LocalTaskArgumentAccessor::allocate(size_t size) {
-    void *ptr = local_allocator.allocate(
-        memory_size); // Note: how(when) to free this memory?
-    void *cpu_ptr = malloc(memory_size);
-    memset(cpu_ptr, 0, memory_size);
-    checkCUDA(cudaMemcpy(
-        ptr, cpu_ptr, memory_size, cudaMemcpyHostToDevice)); // fill ptr
+    void *ptr =
+        local_allocator.allocate(size); // Note: how(when) to free this memory?
+    void *cpu_ptr = malloc(size);
+    memory_usage += size; // update the usage of memory
+    memset(cpu_ptr, 0, size);
+    checkCUDA(
+        cudaMemcpy(ptr, cpu_ptr, size, cudaMemcpyHostToDevice)); // fill ptr
     free(cpu_ptr);
     return ptr;
   }
@@ -40,7 +41,7 @@ using namespace FlexFlow {
                                              // return std::vector<size_t>
       size_t shape_size = gate_preds.shape.dims.get_volume() * size_of(shape);
       void *ptr = allocate(shape_size);
-      return gate_preds_accessor{shape, array_shape, ptr};
+      return gate_preds_accessor{data_type, array_shape, ptr};
     } else if (slot == OUTPUT) {
       ParallelTensorShape output_shape = get<ParallelTensorShape>(
           this->sim_task_binding->tensor_shape_bindings.at(slot));
@@ -50,7 +51,7 @@ using namespace FlexFlow {
                                          // std::vector<size_t>
       size_t shape_size = output_shape.dims.get_volume() * size_of(data_type);
       void *ptr = allocate(shape_size);
-      return {shape, array_shape, ptr};
+      return {data_type, array_shape, ptr};
     } else {
       throw mk_runtime_error(
           "Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor");
@@ -70,7 +71,8 @@ using namespace FlexFlow {
               .get_dims()}; // shape.dims.get_dims() return std::vector<size_t>
       size_t shape_size = shape.dims.get_volume() * size_of(shape.data_type);
       void *ptr = allocate(shape_size);
-      result.push_back({shape, array_shape, ptr});
+      DataType data_type = shape.data_type;
+      result.push_back({data_type, array_shape, ptr});
     }
     return result;
   }
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h
index 448a92d94b..f064aa4b9a 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.h
+++ b/lib/runtime/src/task_spec/task_argument_accessor.h
@@ -172,16 +172,21 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor {
 
   LocalTaskArgumentAccessor(
       std::shared_ptr<SimTaskBinding const> &sim_task_binding)
-      : sim_task_binding(sim_task_binding) {
+      : sim_task_binding(sim_task_binding), memory_usage(0) {
     local_allocator = Allocator::create<CudaAllocator>();
   }
 
+  size_t get_memory_usage() const {
+    return memory_usage;
+  }
+
   void *allocate(size_t size);
   void deallocate(void *ptr);
 
 private:
   std::shared_ptr<SimTaskBinding const> sim_task_binding;
   Allocator local_allocator;
+  size_t memory_usage;
 };
 
 struct TaskArgumentAccessor {

From f39178221866db84db770aa6a5cd69a66880888b Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Mon, 21 Aug 2023 09:07:11 -0700
Subject: [PATCH 12/13] Pull in formatting code from repo-refactor

---
 .clang-format-for-format-sh              | 182 +++++++++++++++++++++++
 .github/workflows/clang-format-check.yml |  20 ++-
 scripts/format.sh                        |  11 +-
 3 files changed, 202 insertions(+), 11 deletions(-)
 create mode 100644 .clang-format-for-format-sh

diff --git a/.clang-format-for-format-sh b/.clang-format-for-format-sh
new file mode 100644
index 0000000000..17e9f8935d
--- /dev/null
+++ b/.clang-format-for-format-sh
@@ -0,0 +1,182 @@
+---
+Language:        Cpp
+# BasedOnStyle:  LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: None
+AlignConsecutiveMacros: None
+AlignConsecutiveAssignments: None
+AlignConsecutiveBitFields: None
+AlignConsecutiveDeclarations: None
+AlignEscapedNewlines: Right
+AlignOperands:   Align
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortEnumsOnASingleLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortLambdasOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+AttributeMacros:
+  - __capability
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterCaseLabel:  false
+  AfterClass:      false
+  AfterControlStatement: Never
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile:     false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeConceptDeclarations: true
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: false
+DisableFormat:   false
+EmptyLineAfterAccessModifier: Never
+EmptyLineBeforeAccessModifier: LogicalBlock
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IfMacros:
+  - KJ_IF_MAYBE
+IncludeBlocks:   Preserve
+IncludeCategories:
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
+    Priority:        3
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '.*'
+    Priority:        1
+    SortPriority:    0
+    CaseSensitive:   false
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseLabels: true
+IndentCaseBlocks: false
+IndentGotoLabels: true
+IndentPPDirectives: None
+IndentExternBlock: AfterExternBlock
+IndentRequires:  false
+IndentWidth:     2
+IndentWrappedFunctionNames: true
+InsertBraces: true
+InsertTrailingCommas: None
+InsertNewlineAtEOF: true
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+LambdaBodyIndentation: Signature
+LineEnding: LF
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCBreakBeforeNestedBlockParam: true
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PenaltyIndentedWhitespace: 0
+PointerAlignment: Right
+PPIndentWidth:   -1
+QualifierAlignment: Right
+ReferenceAlignment: Pointer
+ReflowComments:  true
+ShortNamespaceLines: 1
+SortIncludes:    CaseSensitive
+SortJavaStaticImport: Before
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceAroundPointerQualifiers: Default
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  Never
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInLineCommentPrefix:
+  Minimum:         1
+  Maximum:         -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+BitFieldColonSpacing: Both
+Standard:        Latest
+StatementAttributeLikeMacros:
+  - Q_EMIT
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        8
+UseCRLF:         false
+UseTab:          Never
+WhitespaceSensitiveMacros:
+  - STRINGIZE
+  - PP_STRINGIZE
+  - BOOST_PP_STRINGIZE
+  - NS_SWIFT_NAME
+  - CF_SWIFT_NAME
+...
+
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 46c9bf3be2..fb93fd6b5b 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -5,21 +5,25 @@ jobs:
     name: Formatting Check
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         path:
-          - check: "src"
-            exclude: '\.proto$'
-          - check: "include"
-          - check: "nmt"
-          - check: "python"
-          - check: "scripts"
+          - check: "lib/compiler"
+          - check: "lib/ffi"
+          - check: "lib/kernels"
+          - check: "lib/op-attrs"
+          - check: "lib/pcg"
+          - check: "lib/runtime"
+          - check: "lib/substitutions"
+          - check: "lib/utils"
           - check: "tests"
           - check: "examples"
+          - check: "bindings"
     steps:
       - uses: actions/checkout@v2
       - name: Run clang-format style check for C/C++/Protobuf programs.
-        uses: jidicula/clang-format-action@v4.8.0
+        uses: lockshaw/clang-format-action@v4.11.0-flexflow-3
         with:
-          clang-format-version: "15"
+          clang-format-version: "16"
           check-path: ${{ matrix.path['check'] }}
           exclude-regex: ${{ matrix.path['exclude'] }}
diff --git a/scripts/format.sh b/scripts/format.sh
index 2ed97b8f0a..9610dc2d26 100755
--- a/scripts/format.sh
+++ b/scripts/format.sh
@@ -6,8 +6,8 @@ GIT_ROOT="$(git rev-parse --show-toplevel)"
 cd "$GIT_ROOT"
 
 TOOLS_PATH="$GIT_ROOT/.tools"
-RELEASE="master-1d7ec53d"
-CLANG_FORMAT_VERSION="15"
+RELEASE="master-f4f85437"
+CLANG_FORMAT_VERSION="16"
 CLANG_FORMAT_PATH="$TOOLS_PATH/clang-format-$CLANG_FORMAT_VERSION-$RELEASE"
 
 mkdir -p "$TOOLS_PATH"
@@ -68,5 +68,10 @@ if [[ ! -e $CLANG_FORMAT_PATH ]]; then
   chmod u+x "$CLANG_FORMAT_PATH"
 fi
 
+CLANG_FORMAT_CONFIG="$GIT_ROOT/.clang-format-for-format-sh"
 mapfile -t FILES < <(git ls-files ':!:triton/**' '*.h' '*.cc' '*.cpp' '*.cu' '*.c')
-"$CLANG_FORMAT_PATH" -i "${FILES[@]}"
+if [[ -f $CLANG_FORMAT_CONFIG ]]; then 
+  "$CLANG_FORMAT_PATH" --style=file:"$CLANG_FORMAT_CONFIG" -i "${FILES[@]}"
+else 
+  echo "error"
+fi

From 6469d44e29fad328b873fec9864e193dc11fb354 Mon Sep 17 00:00:00 2001
From: Colin Unger <lockshaw@lockshaw.net>
Date: Mon, 21 Aug 2023 09:07:37 -0700
Subject: [PATCH 13/13] Format

---
 examples/cpp/DLRM/strategies/dlrm_strategy.cc       | 2 +-
 lib/compiler/include/compiler/machine_mapping.h     | 2 +-
 lib/compiler/test/test_dp.cc                        | 2 +-
 lib/compiler/test/test_open_graph.cc                | 2 +-
 lib/kernels/include/kernels/reshape_kernels.h       | 2 +-
 lib/kernels/src/allocation.cc                       | 2 +-
 lib/kernels/src/hip/aggregate_kernels.cpp           | 2 +-
 lib/kernels/src/hip/optimizer_kernel.cpp            | 2 +-
 lib/op-attrs/src/datatype.cc                        | 2 +-
 lib/runtime/src/cuda_allocator.h                    | 2 +-
 lib/runtime/src/sim_environment.cc                  | 2 +-
 lib/runtime/src/task_spec/task_argument_accessor.cc | 2 +-
 lib/runtime/test/src/test_op_task_spec.cc           | 2 +-
 lib/runtime/test/src/test_serialization.cc          | 2 +-
 lib/utils/include/utils/containers.h                | 2 +-
 lib/utils/include/utils/internal_only_tag.h         | 2 +-
 lib/utils/include/utils/visitable.h                 | 2 +-
 lib/utils/test/src/test_algorithms.cc               | 2 +-
 python/flexflow_c.cc                                | 4 +++-
 19 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/examples/cpp/DLRM/strategies/dlrm_strategy.cc b/examples/cpp/DLRM/strategies/dlrm_strategy.cc
index 2fcc4173c9..a7fab8c3a8 100644
--- a/examples/cpp/DLRM/strategies/dlrm_strategy.cc
+++ b/examples/cpp/DLRM/strategies/dlrm_strategy.cc
@@ -356,4 +356,4 @@ int main(int argc, char **argv) {
   std::to_string(gpu) + "gpus.pb"; std::fstream outputFile(output.c_str(),
   std::ios::out | std::ios::trunc); strategy.SerializeToOstream(&outputFile);
   */
-}
\ No newline at end of file
+}
diff --git a/lib/compiler/include/compiler/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping.h
index c105221682..400e2770f8 100644
--- a/lib/compiler/include/compiler/machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping.h
@@ -39,4 +39,4 @@ MachineMapping optimal_cost(
 
 MAKE_VISIT_HASHABLE(::FlexFlow::MachineMapping);
 
-#endif
\ No newline at end of file
+#endif
diff --git a/lib/compiler/test/test_dp.cc b/lib/compiler/test/test_dp.cc
index 18b098a202..01e4189839 100644
--- a/lib/compiler/test/test_dp.cc
+++ b/lib/compiler/test/test_dp.cc
@@ -51,4 +51,4 @@ TEST_CASE("optimal_cost") {
       optimal_cost(pcg, allowed_machine_views, TestCostEstimator{}, resource);
 
   // TODO: check result
-}
\ No newline at end of file
+}
diff --git a/lib/compiler/test/test_open_graph.cc b/lib/compiler/test/test_open_graph.cc
index 6288b481fe..66af736a50 100644
--- a/lib/compiler/test/test_open_graph.cc
+++ b/lib/compiler/test/test_open_graph.cc
@@ -102,4 +102,4 @@ TEST_CASE("get_cut") {
 
   GraphSplit gs1{{ns[0], ns[1], ns[2], ns[3]}, {ns[4]}};
   CHECK(get_cut(g, gs1) == std::unordered_set<MultiDiEdge>{e3, e4});
-}
\ No newline at end of file
+}
diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h
index fa752b6c2b..972f8ee9b2 100644
--- a/lib/kernels/include/kernels/reshape_kernels.h
+++ b/lib/kernels/include/kernels/reshape_kernels.h
@@ -30,4 +30,4 @@ void backward_kernel(ffStream_t stream,
 } // namespace Kernels
 } // namespace FlexFlow
 
-#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H
\ No newline at end of file
+#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H
diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc
index eb90b0cb69..fa32a9a705 100644
--- a/lib/kernels/src/allocation.cc
+++ b/lib/kernels/src/allocation.cc
@@ -10,4 +10,4 @@ void Allocator::deallocate(void *ptr) {
   i_allocator->deallocate(ptr);
 }
 
-} // namespace FlexFlow
\ No newline at end of file
+} // namespace FlexFlow
diff --git a/lib/kernels/src/hip/aggregate_kernels.cpp b/lib/kernels/src/hip/aggregate_kernels.cpp
index ff50a8c7ad..40faaaeeff 100644
--- a/lib/kernels/src/hip/aggregate_kernels.cpp
+++ b/lib/kernels/src/hip/aggregate_kernels.cpp
@@ -296,4 +296,4 @@ AggregatePerDeviceState::~AggregatePerDeviceState(void) {
 
 } // namespace Aggregate
 } // namespace Kernels
-} // namespace FlexFlow
\ No newline at end of file
+} // namespace FlexFlow
diff --git a/lib/kernels/src/hip/optimizer_kernel.cpp b/lib/kernels/src/hip/optimizer_kernel.cpp
index 7f57d6a2fb..c22ecd7f5a 100644
--- a/lib/kernels/src/hip/optimizer_kernel.cpp
+++ b/lib/kernels/src/hip/optimizer_kernel.cpp
@@ -245,4 +245,4 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
 }
 #endif
 
-}; // namespace FlexFlow
\ No newline at end of file
+}; // namespace FlexFlow
diff --git a/lib/op-attrs/src/datatype.cc b/lib/op-attrs/src/datatype.cc
index fc47ef47eb..6fb2d41051 100644
--- a/lib/op-attrs/src/datatype.cc
+++ b/lib/op-attrs/src/datatype.cc
@@ -21,4 +21,4 @@ size_t size_of(DataType data_type) {
   }
 }
 
-} // namespace FlexFlow
\ No newline at end of file
+} // namespace FlexFlow
diff --git a/lib/runtime/src/cuda_allocator.h b/lib/runtime/src/cuda_allocator.h
index cb3c8e1c19..94f47ba035 100644
--- a/lib/runtime/src/cuda_allocator.h
+++ b/lib/runtime/src/cuda_allocator.h
@@ -15,4 +15,4 @@ struct CudaAllocator : public IAllocator {
 
 } // namespace FlexFlow
 
-#endif
\ No newline at end of file
+#endif
diff --git a/lib/runtime/src/sim_environment.cc b/lib/runtime/src/sim_environment.cc
index 8233735396..8c599ca2e5 100644
--- a/lib/runtime/src/sim_environment.cc
+++ b/lib/runtime/src/sim_environment.cc
@@ -23,4 +23,4 @@ namespaec FlexFlow {
     NOT_IMPLEMENTED(); // TODO
   }
 
-} // namespace FlexFlow
\ No newline at end of file
+} // namespace FlexFlow
diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc
index c74685d662..26ad9e3016 100644
--- a/lib/runtime/src/task_spec/task_argument_accessor.cc
+++ b/lib/runtime/src/task_spec/task_argument_accessor.cc
@@ -77,4 +77,4 @@ using namespace FlexFlow {
     return result;
   }
 
-} // namespace FlexFlow
\ No newline at end of file
+} // namespace FlexFlow
diff --git a/lib/runtime/test/src/test_op_task_spec.cc b/lib/runtime/test/src/test_op_task_spec.cc
index 821ef0dba6..bb0bee567c 100644
--- a/lib/runtime/test/src/test_op_task_spec.cc
+++ b/lib/runtime/test/src/test_op_task_spec.cc
@@ -44,4 +44,4 @@ TEST_CASE("OpTaskBinding") {
   correct_bwd.bind_grad(2, input_tensor(2).grad());
 
   CHECK(correct_bwd == bwd);
-}
\ No newline at end of file
+}
diff --git a/lib/runtime/test/src/test_serialization.cc b/lib/runtime/test/src/test_serialization.cc
index caf5cd7c93..ef18764efb 100644
--- a/lib/runtime/test/src/test_serialization.cc
+++ b/lib/runtime/test/src/test_serialization.cc
@@ -44,4 +44,4 @@ TEST_CASE("Serialization") {
       RC_ASSERT(post_op == pre_op);
     }))
   }
-}
\ No newline at end of file
+}
diff --git a/lib/utils/include/utils/containers.h b/lib/utils/include/utils/containers.h
index df156c9060..236e4b6eac 100644
--- a/lib/utils/include/utils/containers.h
+++ b/lib/utils/include/utils/containers.h
@@ -714,4 +714,4 @@ reversed_container_t<C> reversed_container(C const &c) {
 
 } // namespace FlexFlow
 
-#endif
\ No newline at end of file
+#endif
diff --git a/lib/utils/include/utils/internal_only_tag.h b/lib/utils/include/utils/internal_only_tag.h
index 649ce4cf12..1e5f8571d0 100644
--- a/lib/utils/include/utils/internal_only_tag.h
+++ b/lib/utils/include/utils/internal_only_tag.h
@@ -7,4 +7,4 @@ struct should_only_be_used_internally_tag_t {
 };
 } // namespace FlexFlow
 
-#endif
\ No newline at end of file
+#endif
diff --git a/lib/utils/include/utils/visitable.h b/lib/utils/include/utils/visitable.h
index 4f0dc50cbe..6a671400cd 100644
--- a/lib/utils/include/utils/visitable.h
+++ b/lib/utils/include/utils/visitable.h
@@ -457,4 +457,4 @@ struct Arbitrary<
                            _GET_VISITABLE_CASE_FROM_NUM_ARGS(__VA_ARGS__),     \
                            __VA_ARGS__)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/lib/utils/test/src/test_algorithms.cc b/lib/utils/test/src/test_algorithms.cc
index 35534f5b3a..7748bd3ff0 100644
--- a/lib/utils/test/src/test_algorithms.cc
+++ b/lib/utils/test/src/test_algorithms.cc
@@ -220,4 +220,4 @@ TEST_CASE("get_weakly_connected_components") {
   };
 
   CHECK(get_weakly_connected_components(g) == expected_components);
-}
\ No newline at end of file
+}
diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc
index 4c16dd82e9..1d3d77afe5 100644
--- a/python/flexflow_c.cc
+++ b/python/flexflow_c.cc
@@ -33,7 +33,9 @@ class FFCObjectWrapper {
     t_.impl = const_cast<void *>(static_cast<void const *>(t));                \
     return t_;                                                                 \
   }                                                                            \
-  static T unwrap(T_ t_) { return static_cast<T>(t_.impl); }                   \
+  static T unwrap(T_ t_) {                                                     \
+    return static_cast<T>(t_.impl);                                            \
+  }                                                                            \
   static const T unwrap_const(const T_ t_) {                                   \
     return static_cast<const T>(t_.impl);                                      \
   }