diff --git a/.clang-format-for-format-sh b/.clang-format-for-format-sh new file mode 100644 index 0000000000..17e9f8935d --- /dev/null +++ b/.clang-format-for-format-sh @@ -0,0 +1,182 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +AttributeMacros: + - __capability +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true 
+ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseLabels: true +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 2 +IndentWrappedFunctionNames: true +InsertBraces: true +InsertTrailingCommas: None +InsertNewlineAtEOF: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +PPIndentWidth: -1 +QualifierAlignment: Right 
+ReferenceAlignment: Pointer +ReflowComments: true +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +WhitespaceSensitiveMacros: + - STRINGIZE + - PP_STRINGIZE + - BOOST_PP_STRINGIZE + - NS_SWIFT_NAME + - CF_SWIFT_NAME +... 
+ diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 46c9bf3be2..fb93fd6b5b 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -5,21 +5,25 @@ jobs: name: Formatting Check runs-on: ubuntu-latest strategy: + fail-fast: false matrix: path: - - check: "src" - exclude: '\.proto$' - - check: "include" - - check: "nmt" - - check: "python" - - check: "scripts" + - check: "lib/compiler" + - check: "lib/ffi" + - check: "lib/kernels" + - check: "lib/op-attrs" + - check: "lib/pcg" + - check: "lib/runtime" + - check: "lib/substitutions" + - check: "lib/utils" - check: "tests" - check: "examples" + - check: "bindings" steps: - uses: actions/checkout@v2 - name: Run clang-format style check for C/C++/Protobuf programs. - uses: jidicula/clang-format-action@v4.8.0 + uses: lockshaw/clang-format-action@v4.11.0-flexflow-3 with: - clang-format-version: "15" + clang-format-version: "16" check-path: ${{ matrix.path['check'] }} exclude-regex: ${{ matrix.path['exclude'] }} diff --git a/examples/cpp/DLRM/strategies/dlrm_strategy.cc b/examples/cpp/DLRM/strategies/dlrm_strategy.cc index 2fcc4173c9..a7fab8c3a8 100644 --- a/examples/cpp/DLRM/strategies/dlrm_strategy.cc +++ b/examples/cpp/DLRM/strategies/dlrm_strategy.cc @@ -356,4 +356,4 @@ int main(int argc, char **argv) { std::to_string(gpu) + "gpus.pb"; std::fstream outputFile(output.c_str(), std::ios::out | std::ios::trunc); strategy.SerializeToOstream(&outputFile); */ -} \ No newline at end of file +} diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index f9ad92a147..ece639cb6a 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,8 +1,8 @@ #add_subdirectory(pcg) #add_subdirectory(compiler) -# add_subdirectory(runtime) -#add_subdirectory(op-attrs) -#add_subdirectory(kernels) +add_subdirectory(runtime) +add_subdirectory(op-attrs) +add_subdirectory(kernels) add_subdirectory(utils) # add_subdirectory(ffi) 
#add_subdirectory(substitutions) diff --git a/lib/compiler/include/compiler/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping.h index c105221682..400e2770f8 100644 --- a/lib/compiler/include/compiler/machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping.h @@ -39,4 +39,4 @@ MachineMapping optimal_cost( MAKE_VISIT_HASHABLE(::FlexFlow::MachineMapping); -#endif \ No newline at end of file +#endif diff --git a/lib/compiler/test/test_dp.cc b/lib/compiler/test/test_dp.cc index 18b098a202..01e4189839 100644 --- a/lib/compiler/test/test_dp.cc +++ b/lib/compiler/test/test_dp.cc @@ -51,4 +51,4 @@ TEST_CASE("optimal_cost") { optimal_cost(pcg, allowed_machine_views, TestCostEstimator{}, resource); // TODO: check result -} \ No newline at end of file +} diff --git a/lib/compiler/test/test_open_graph.cc b/lib/compiler/test/test_open_graph.cc index 6288b481fe..66af736a50 100644 --- a/lib/compiler/test/test_open_graph.cc +++ b/lib/compiler/test/test_open_graph.cc @@ -102,4 +102,4 @@ TEST_CASE("get_cut") { GraphSplit gs1{{ns[0], ns[1], ns[2], ns[3]}, {ns[4]}}; CHECK(get_cut(g, gs1) == std::unordered_set{e3, e4}); -} \ No newline at end of file +} diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index fa752b6c2b..972f8ee9b2 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -30,4 +30,4 @@ void backward_kernel(ffStream_t stream, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc new file mode 100644 index 0000000000..fa32a9a705 --- /dev/null +++ b/lib/kernels/src/allocation.cc @@ -0,0 +1,13 @@ +#include "kernels/allocation.h" + +namespace FlexFlow { + +void *Allocator::allocate(size_t size) { + return i_allocator->allocate(size); +} + 
+void Allocator::deallocate(void *ptr) { + i_allocator->deallocate(ptr); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 44507c14c4..67c3de54dd 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -10,4 +10,6 @@ std::size_t ArrayShape::get_volume() const { return product(this->dims); } +ArrayShape::ArrayShape(std::vector const &_dims) : dims(_dims) {} + } // namespace FlexFlow diff --git a/lib/kernels/src/hip/aggregate_kernels.cpp b/lib/kernels/src/hip/aggregate_kernels.cpp index ff50a8c7ad..40faaaeeff 100644 --- a/lib/kernels/src/hip/aggregate_kernels.cpp +++ b/lib/kernels/src/hip/aggregate_kernels.cpp @@ -296,4 +296,4 @@ AggregatePerDeviceState::~AggregatePerDeviceState(void) { } // namespace Aggregate } // namespace Kernels -} // namespace FlexFlow \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/kernels/src/hip/optimizer_kernel.cpp b/lib/kernels/src/hip/optimizer_kernel.cpp index 7f57d6a2fb..c22ecd7f5a 100644 --- a/lib/kernels/src/hip/optimizer_kernel.cpp +++ b/lib/kernels/src/hip/optimizer_kernel.cpp @@ -245,4 +245,4 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, } #endif -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index d38ba75232..aeec0c6e01 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -11,6 +11,7 @@ struct ParallelTensorDims : public use_visitable_cmp { size_t get_volume() const; size_t num_dims() const; + std::vector get_dims() const; using iterator = typename FFOrdered::iterator; using const_iterator = typename FFOrdered::const_iterator; diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index 
fd560352bb..182b2169c3 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -26,7 +26,6 @@ struct ParallelTensorShape : public use_visitable_cmp { ParallelTensorShape(TensorShape const &); int num_dims() const; - ParallelDim const &at(ff_dim_t const &) const; ParallelDim &at(ff_dim_t const &); ParallelDim const &operator[](ff_dim_t const &) const; diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index fa34860817..ab0f958b42 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -16,7 +16,7 @@ struct TensorShape : public use_visitable_cmp { template TensorShape(Dims const &dims, DataType data_type) - : dims(dims), data_type(data_type) {} + : dims(dims), data_type(data_type) {} size_t at(ff_dim_t) const; size_t operator[](ff_dim_t) const; diff --git a/lib/op-attrs/src/datatype.cc b/lib/op-attrs/src/datatype.cc new file mode 100644 index 0000000000..6fb2d41051 --- /dev/null +++ b/lib/op-attrs/src/datatype.cc @@ -0,0 +1,24 @@ +#include "op-attrs/datatype.h" + +namespace FlexFlow { + +size_t size_of(DataType data_type) { + switch (data_type) { + case DataType::BOOL: + return sizeof(bool); + case DataType::INT32: + return sizeof(int32_t); + case DataType::INT64: + return sizeof(int64_t); + case DataType::HALF: + return sizeof(float) / 2; + case DataType::FLOAT: + return sizeof(float); + case DataType::DOUBLE: + return sizeof(double); + default: + throw mk_runtime_error("Unknown data type"); + } +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/parallel_tensor_shape.cc b/lib/op-attrs/src/parallel_tensor_shape.cc index 9a36e7d11b..0f4121973f 100644 --- a/lib/op-attrs/src/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/parallel_tensor_shape.cc @@ -16,6 +16,24 @@ static std::vector lift_dims(TensorDims const &dims) { ParallelTensorDims::ParallelTensorDims(TensorDims
const &dims) : data(lift_dims(dims)) {} +std::vector ParallelTensorDims::get_dims() const { + std::vector dims; + for (ParallelDim const &d : this->data) { + dims.push_back(d.size); + } + return dims; +} + +size_t ParallelTensorDims::get_volume() const { + + // this function can use contains.h to optimize the code + size_t volume = 1; + for (ParallelDim const &d : this->data) { + volume *= d.size; + } + return volume; +} + ParallelTensorShape::ParallelTensorShape(TensorShape const &tensor_shape) : dims(tensor_shape.dims), data_type(tensor_shape.data_type) {} @@ -23,6 +41,10 @@ int get_num_replica_dims(ParallelTensorShape const &shape) { return count(shape.dims, is_replica_dim); } +TensorShape get_piece_shape(ParallelTensorShape const ¶ll_tensor_shape) { + return TensorShape(parall_tensor_shape.dims, parall_tensor_shape.data_type); +} + int get_num_replicas(ParallelTensorShape const &shape) { return product( transform(filter(as_vector(shape.dims), is_replica_dim), diff --git a/lib/runtime/src/cuda_allocator.cc b/lib/runtime/src/cuda_allocator.cc new file mode 100644 index 0000000000..f60bf513ee --- /dev/null +++ b/lib/runtime/src/cuda_allocator.cc @@ -0,0 +1,16 @@ +#include "cuda_allocator.h" +#include "kernels/device.h" + +namespace FlexFlow { + +void *CudaAllocator::allocate(size_t size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, size)); + return ptr; +} + +void CudaAllocator::deallocate(void *ptr) { + checkCUDA(cudaFree(ptr)); +} + +} // namespace FlexFlow diff --git a/lib/runtime/src/cuda_allocator.h b/lib/runtime/src/cuda_allocator.h new file mode 100644 index 0000000000..94f47ba035 --- /dev/null +++ b/lib/runtime/src/cuda_allocator.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_RUNTIME_CUDA_ALLOCATOR_H +#define _FLEXFLOW_RUNTIME_CUDA_ALLOCATOR_H + +#include "kernels/allocation.h" +#include + +namespace FlexFlow { + +struct CudaAllocator : public IAllocator { + ~CudaAllocator() override; + + void *allocate(size_t) override; + void deallocate(void *) override; +}; + +} 
// namespace FlexFlow + +#endif diff --git a/lib/runtime/src/ops/aggregate.cc b/lib/runtime/src/ops/aggregate.cc index e9874c877a..c1136f513e 100644 --- a/lib/runtime/src/ops/aggregate.cc +++ b/lib/runtime/src/ops/aggregate.cc @@ -467,6 +467,8 @@ CostMetrics fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(ATTRS, attrs); + auto fwd_accessor = env.get_fwd_accessor(AGGREGATE_FWD_TASK_ID, fwd_binding); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); diff --git a/lib/runtime/src/realm_allocator.h b/lib/runtime/src/realm_allocator.h index 210fd8a050..95957b643d 100644 --- a/lib/runtime/src/realm_allocator.h +++ b/lib/runtime/src/realm_allocator.h @@ -15,10 +15,7 @@ struct RealmAllocator : public IAllocator { ~RealmAllocator() override; void *allocate(size_t) override; - void deallocate(void *) override; - -private: - Legion::Memory memory; + void deallocate(void *) override; private: Legion::Memory memory; stack_vector instances; }; diff --git a/lib/runtime/src/sim_environment.cc b/lib/runtime/src/sim_environment.cc new file mode 100644 index 0000000000..8c599ca2e5 --- /dev/null +++ b/lib/runtime/src/sim_environment.cc @@ -0,0 +1,26 @@ +#include "sim_environment.h" + +namespace FlexFlow { + + void SimTaskBinding::bind(slot_id id, ParallelTensorShape const &shape) { + tensor_shape_bindings.insert(id, shape); + } + void SimTaskBinding::bind(slot_id id, TensorShape const &shape) { + tensor_shape_bindings.insert(id, shape); + } + + void SimTaskBinding::bind(slot_id id, + InputVariadicParallelTensorDesc const &desc) { + this->tensor_shape_bindings.insert(id, desc); + } + + void SimTaskBinding::bind_arg(slot_id id, SimArg const &arg) { + arg_bindings.insert(id, arg); + } + + TaskArgumentAccessor SimEnvironment::get_fwd_accessor( + task_id_t tid, SimTaskBinding const &sim_task_binding) { + NOT_IMPLEMENTED(); // TODO + } + +} // namespace FlexFlow diff --git a/lib/runtime/src/sim_environment.h b/lib/runtime/src/sim_environment.h index
d08bef653f..57b00eda34 100644 --- a/lib/runtime/src/sim_environment.h +++ b/lib/runtime/src/sim_environment.h @@ -4,9 +4,11 @@ #include "cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" +#include "kernels/profiling.h" #include "op-attrs/parallel_tensor_shape.h" #include "task_spec/op_task_invocation.h" #include "task_spec/task_argument_accessor.h" +#include #include namespace FlexFlow { @@ -23,6 +25,11 @@ struct InputVariadicParallelTensorDesc { IsTrainable trainable; }; +using SimArg = variant; +using SimTensorSpec = variant; + struct SimTaskBinding { void bind(slot_id, ParallelTensorShape const &); void bind_untrainable(slot_id, ParallelTensorShape const &); @@ -35,7 +42,10 @@ struct SimTaskBinding { void bind(slot_id, InputVariadicParallelTensorDesc const &); template - void bind_arg(slot_id, T const &); + void bind_arg(slot_id id, T const &name); + + std::unordered_map arg_bindings; + std::unordered_map tensor_shape_bindings; }; SimTaskBinding infer_bwd_binding(SimTaskBinding const &); diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc new file mode 100644 index 0000000000..26ad9e3016 --- /dev/null +++ b/lib/runtime/src/task_spec/task_argument_accessor.cc @@ -0,0 +1,80 @@ +#include "task_argument_accessor.h" + +namespace FlexFlow { + + template + T const &LocalTaskArgumentAccessor::get_argument(slot_id slot) const { + if (slot == PROFILING) { + return get(this->arg_bindings.at(slot)); + } + else if (slot == ATTRS) { + return get(this->arg_bindings.at(slot)); + } + else { + throw std::runtime_error( + "Unknown Slot ID in LocalTaskArgumentAccessor::get_argument"); + } + } + + void *LocalTaskArgumentAccessor::allocate(size_t size) { + void *ptr = + local_allocator.allocate(size); // Note: how(when) to free this memory?
+ void *cpu_ptr = malloc(size); + memory_usage += size; // update the usage of memory + memset(cpu_ptr, 0, size); + checkCUDA( + cudaMemcpy(ptr, cpu_ptr, size, cudaMemcpyHostToDevice)); // fill ptr + free(cpu_ptr); + return ptr; + } + + template + privilege_mode_to_accessor LocalTaskArgumentAccessor::get_tensor( + slot_id slot) const { + SimTensorSpec const &spec = this->tensor_shape_bindings.at(slot); + if (slot == GATE_PREDS || slot == GATE_ASSIGN) { + InputParallelTensorDesc gate_preds = get( + this->sim_task_binding->tensor_shape_bindings.at(slot)); + DataType data_type = gate_preds.shape.data_type; + ArrayShape array_shape = { + gate_preds.shape.dims.get_dims()}; // gate_preds.shape.dims.get_dims() + // return std::vector + size_t shape_size = gate_preds.shape.dims.get_volume() * size_of(data_type); + void *ptr = allocate(shape_size); + return gate_preds_accessor{data_type, array_shape, ptr}; + } else if (slot == OUTPUT) { + ParallelTensorShape output_shape = get( + this->sim_task_binding->tensor_shape_bindings.at(slot)); + DataType data_type = output_shape.data_type; + ArrayShape array_shape = { + output_shape.dims.get_dims()}; // output_shape.dims.get_dims() return + // std::vector + size_t shape_size = output_shape.dims.get_volume() * size_of(data_type); + void *ptr = allocate(shape_size); + return {data_type, array_shape, ptr}; + } else { + throw mk_runtime_error( + "Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor"); + } + } + + template + std::vector> get_variadic_tensor( + slot_id slot) const override { + std::vector> result; + InputVariadicParallelTensorDesc const &spec = + get( + this->sim_task_binding->tensor_shape_bindings.at(slot)); + for (auto const &shape : spec.shapes) { + ArrayShape array_shape = { + shape.dims + .get_dims()}; // shape.dims.get_dims() return std::vector + size_t shape_size = shape.dims.get_volume() * size_of(shape.data_type); + void *ptr = allocate(shape_size); + DataType data_type = shape.data_type; + 
result.push_back({data_type, array_shape, ptr}); + } + return result; + } + +} // namespace FlexFlow diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h index 4a4cf64512..f064aa4b9a 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ b/lib/runtime/src/task_spec/task_argument_accessor.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H #include "accessor.h" +#include "kernels/allocation.h" #include "runtime/config.h" #include "task_invocation.h" #include "utils/exception.h" @@ -70,104 +71,154 @@ region_idx_t get_region_idx(TaskArgumentsFormat const &, parallel_tensor_guid_t const &); DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &); -struct TaskArgumentAccessor { - TaskArgumentAccessor(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); +struct ITaskArgumentAccessor { + virtual template + T const &get_argument(slot_id slot) const = 0; + + virtual template + privilege_mode_to_accessor get_tensor(slot_id slot) const = 0; + + virtual template + std::vector> + get_variadic_tensor(slot_id slot) const = 0; + + virtual template + optional get_optional_argument(slot_id) const = 0; + + virtual template + std::vector get_variadic_argument(slot_id) const = 0; + + virtual template + privilege_mode_to_accessor + get_generic_accessor(region_idx_t const &idx) const = 0; + + virtual template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const = 0; + virtual template + std::vector> + get_variadic_tensor_grad(slot_id slot) const = 0; + + virtual size_t get_device_idx() const = 0; +}; + +struct LegionTaskArgumentAccessor : public ITaskArgumentAccessor { +public: template - T const &get_argument(slot_id slot) const { - TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); - std::type_index actual_type = arg_fmt.type; - std::type_index requested_type = {typeid(T)}; + T const 
&get_argument(slot_id slot) const override; - if (actual_type != requested_type) { - throw mk_runtime_error( - "Type mismatch in argument access (\"{}\" != \"{}\")", - actual_type.name(), - requested_type.name()); - } + template + privilege_mode_to_accessor get_tensor(slot_id slot) const override; - void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; - Legion::Deserializer dez(start_ptr, arg_fmt.size()); + template + std::vector> + get_variadic_tensor(slot_id slot) const override; - return ff_task_deserialize(dez); - } + template + optional get_optional_argument(slot_id) const override; template - optional get_optional_argument(slot_id) const { - NOT_IMPLEMENTED(); - } + std::vector get_variadic_argument(slot_id) const override; + + template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const override; + + template + std::vector> + get_variadic_tensor_grad(slot_id slot) const override; + + size_t get_device_idx() const override; + LegionTaskArgumentAccessor(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) + : task(task), regions(regions), ctx(ctx), runtime(runtime) {} + +private: + Legion::Task const *task; + std::vector const ®ions; + Legion::Context ctx; + Legion::Runtime *runtime; + TaskArgumentsFormat const &args_fmt; +}; + +struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { +public: template - std::vector get_variadic_argument(slot_id) const { - NOT_IMPLEMENTED(); - } + T const &get_argument(slot_id slot) const override; template - privilege_mode_to_accessor - get_generic_accessor(region_idx_t const &idx) const { - auto tensor_privs = get_permissions(this->args_fmt, idx); - if (tensor_privs != PRIV) { - throw mk_runtime_error( - "Privilege mismatch while accessing tensor: {} != {}", - tensor_privs, - PRIV); - } - - return helperGetGenericTensorAccessor( - get_datatype(this->args_fmt, idx), - regions[idx.value()], - task->regions[idx.value()], - FID_DATA, - ctx, 
- runtime); - } + privilege_mode_to_accessor get_tensor(slot_id slot) const override; template - privilege_mode_to_accessor get_tensor(slot_id slot) const { - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); + std::vector> + get_variadic_tensor(slot_id slot) const override; - return this->get_generic_accessor(argument_format); - } + template + optional get_optional_argument(slot_id) const override; + + template + std::vector get_variadic_argument(slot_id) const override; template - privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const override; template std::vector> - get_variadic_tensor(slot_id slot) const { - std::vector> result; + get_variadic_tensor_grad(slot_id slot) const override; + + size_t get_device_idx() const override; - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - for (NonvariadicFormat const &argument : argument_format) { - result.push_back(this->get_generic_accessor(argument)); - } + LocalTaskArgumentAccessor( + std::shared_ptr &sim_task_binding) + : sim_task_binding(sim_task_binding), memory_usage(0) { + local_allocator = Allocator::create(); + } - return result; + size_t get_memory_usage() const { + return memory_usage; + } + + void *allocate(size_t size); + void deallocate(void *ptr); + +private: + std::shared_ptr sim_task_binding; + Allocator local_allocator; + size_t memory_usage; +}; + +struct TaskArgumentAccessor { + template + T const &get_argument(slot_id slot) const { + return this->ptr->get_argument(slot); + } + + template + privilege_mode_to_accessor get_tensor(slot_id slot) const { + return this->ptr->get_tensor(slot); } template std::vector> - get_variadic_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); + get_variadic_tensor(slot_id slot) const { + return this->ptr->get_variadic_tensor(slot); } - size_t get_device_idx() const { - NOT_IMPLEMENTED(); + template + static + typename 
std::enable_if::value, + TaskArgumentAccessor>::type + create(Args &&...args) { + return TaskArgumentAccessor( + std::make_shared(std::forward(args)...)); } private: - Legion::Task const *task; - std::vector const ®ions; - Legion::Context ctx; - Legion::Runtime *runtime; - TaskArgumentsFormat const &args_fmt; + TaskArgumentAccessor(std::shared_ptr &ptr) + : ptr(ptr) {} + std::shared_ptr ptr; }; } // namespace FlexFlow diff --git a/lib/runtime/test/src/test_op_task_spec.cc b/lib/runtime/test/src/test_op_task_spec.cc index 821ef0dba6..bb0bee567c 100644 --- a/lib/runtime/test/src/test_op_task_spec.cc +++ b/lib/runtime/test/src/test_op_task_spec.cc @@ -44,4 +44,4 @@ TEST_CASE("OpTaskBinding") { correct_bwd.bind_grad(2, input_tensor(2).grad()); CHECK(correct_bwd == bwd); -} \ No newline at end of file +} diff --git a/lib/runtime/test/src/test_serialization.cc b/lib/runtime/test/src/test_serialization.cc index caf5cd7c93..ef18764efb 100644 --- a/lib/runtime/test/src/test_serialization.cc +++ b/lib/runtime/test/src/test_serialization.cc @@ -44,4 +44,4 @@ TEST_CASE("Serialization") { RC_ASSERT(post_op == pre_op); })) } -} \ No newline at end of file +} diff --git a/lib/utils/include/utils/containers.h b/lib/utils/include/utils/containers.h index df156c9060..236e4b6eac 100644 --- a/lib/utils/include/utils/containers.h +++ b/lib/utils/include/utils/containers.h @@ -714,4 +714,4 @@ reversed_container_t reversed_container(C const &c) { } // namespace FlexFlow -#endif \ No newline at end of file +#endif diff --git a/lib/utils/include/utils/internal_only_tag.h b/lib/utils/include/utils/internal_only_tag.h index 649ce4cf12..1e5f8571d0 100644 --- a/lib/utils/include/utils/internal_only_tag.h +++ b/lib/utils/include/utils/internal_only_tag.h @@ -7,4 +7,4 @@ struct should_only_be_used_internally_tag_t { }; } // namespace FlexFlow -#endif \ No newline at end of file +#endif diff --git a/lib/utils/include/utils/visitable.h b/lib/utils/include/utils/visitable.h index 
4f0dc50cbe..6a671400cd 100644 --- a/lib/utils/include/utils/visitable.h +++ b/lib/utils/include/utils/visitable.h @@ -457,4 +457,4 @@ struct Arbitrary< _GET_VISITABLE_CASE_FROM_NUM_ARGS(__VA_ARGS__), \ __VA_ARGS__) -#endif \ No newline at end of file +#endif diff --git a/lib/utils/test/src/test_algorithms.cc b/lib/utils/test/src/test_algorithms.cc index 35534f5b3a..7748bd3ff0 100644 --- a/lib/utils/test/src/test_algorithms.cc +++ b/lib/utils/test/src/test_algorithms.cc @@ -220,4 +220,4 @@ TEST_CASE("get_weakly_connected_components") { }; CHECK(get_weakly_connected_components(g) == expected_components); -} \ No newline at end of file +} diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index 4c16dd82e9..1d3d77afe5 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -33,7 +33,9 @@ class FFCObjectWrapper { t_.impl = const_cast(static_cast(t)); \ return t_; \ } \ - static T unwrap(T_ t_) { return static_cast(t_.impl); } \ + static T unwrap(T_ t_) { \ + return static_cast(t_.impl); \ + } \ static const T unwrap_const(const T_ t_) { \ return static_cast(t_.impl); \ } diff --git a/scripts/format.sh b/scripts/format.sh index 2ed97b8f0a..9610dc2d26 100755 --- a/scripts/format.sh +++ b/scripts/format.sh @@ -6,8 +6,8 @@ GIT_ROOT="$(git rev-parse --show-toplevel)" cd "$GIT_ROOT" TOOLS_PATH="$GIT_ROOT/.tools" -RELEASE="master-1d7ec53d" -CLANG_FORMAT_VERSION="15" +RELEASE="master-f4f85437" +CLANG_FORMAT_VERSION="16" CLANG_FORMAT_PATH="$TOOLS_PATH/clang-format-$CLANG_FORMAT_VERSION-$RELEASE" mkdir -p "$TOOLS_PATH" @@ -68,5 +68,10 @@ if [[ ! -e $CLANG_FORMAT_PATH ]]; then chmod u+x "$CLANG_FORMAT_PATH" fi +CLANG_FORMAT_CONFIG="$GIT_ROOT/.clang-format-for-format-sh" mapfile -t FILES < <(git ls-files ':!:triton/**' '*.h' '*.cc' '*.cpp' '*.cu' '*.c') -"$CLANG_FORMAT_PATH" -i "${FILES[@]}" +if [[ -f $CLANG_FORMAT_CONFIG ]]; then + "$CLANG_FORMAT_PATH" --style=file:"$CLANG_FORMAT_CONFIG" -i "${FILES[@]}" +else + echo "error" +fi