From dbfc5472072e91d68721f3a100c060b4d337ab78 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 19:14:49 +0000
Subject: [PATCH 1/9] init draft of LayerNorm init task and leave some comments

---
 deps/fmt                                      |  2 +-
 .../include/kernels/layer_norm_kernels.h      | 37 +++++++++-
 lib/kernels/src/cuda/layer_norm_kernels.cu    | 37 +++++++++-
 lib/runtime/src/ops/layer_norm.cc             | 69 +++++++++++++++++++
 4 files changed, 140 insertions(+), 5 deletions(-)

diff --git a/deps/fmt b/deps/fmt
index f5e54359df..a33701196a 160000
--- a/deps/fmt
+++ b/deps/fmt
@@ -1 +1 @@
-Subproject commit f5e54359df4c26b6230fc61d38aa294581393084
+Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50
diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h
index a49e1b3483..cd07a6878c 100644
--- a/lib/kernels/include/kernels/layer_norm_kernels.h
+++ b/lib/kernels/include/kernels/layer_norm_kernels.h
@@ -2,6 +2,8 @@
 #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H
 
 #include "kernels/device.h"
+#include "kernels/allocation.h"
+#include "kernels/ff_handle.h"
 
 namespace FlexFlow {
 
@@ -23,18 +25,49 @@ class LayerNormPerDeviceState : public PerDeviceOpState {
   DataType data_type;
 };
 
+struct LayerNormPerDeviceState {
+    bool elementwise_affine;
+    int64_t effective_batch_size, effective_num_elements;
+    float eps;
+    float *mean, *rstd, *ds, *db, *scale, *bias;
+    DataType data_type;
+};
+
+FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState,
+                                             elementwise_affine,
+                                             effective_batch_size,
+                                             effective_num_elements,
+                                             eps,
+                                             mean,
+                                             rstd,
+                                             ds,
+                                             db,
+                                             scale,
+                                             bias,
+                                             data_type);
+
+
 namespace Kernels {
 namespace LayerNorm {
 
+//todo: this may have some problem.
+LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &,
+                              Allocator const &,
+                              bool elementwise_affine,
+                              int64_t effective_batch_size,
+                              int64_t effective_num_elements,
+                              float eps);
+
+
 void forward_kernel(ffStream_t stream,
-                    LayerNormPerDeviceState const *m,
+                    LayerNormPerDeviceState const &m,
                     GenericTensorAccessorR const &input,
                     GenericTensorAccessorW const &output,
                     GenericTensorAccessorW const &gamma,
                     GenericTensorAccessorW const &beta);
 
 void backward_kernel(ffStream_t stream,
-                     LayerNormPerDeviceState const *m,
+                     LayerNormPerDeviceState const &m,
                      GenericTensorAccessorR const &output_grad,
                      GenericTensorAccessorR const &input,
                      GenericTensorAccessorW const &input_grad,
diff --git a/lib/kernels/src/cuda/layer_norm_kernels.cu b/lib/kernels/src/cuda/layer_norm_kernels.cu
index 65d33bec5e..7dc447d511 100644
--- a/lib/kernels/src/cuda/layer_norm_kernels.cu
+++ b/lib/kernels/src/cuda/layer_norm_kernels.cu
@@ -48,6 +48,39 @@ LayerNormPerDeviceState::LayerNormPerDeviceState(
 namespace Kernels {
 namespace LayerNorm {
 
+//todo: this may have some problem.
+LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const & handle,
+                              Allocator const & allocator,
+                              bool elementwise_affine_,
+                              int64_t effective_batch_size_,
+                              int64_t effective_num_elements_,
+                              float eps_) {
+  elementwise_affine = elementwise_affine_;
+  effective_batch_size = effective_batch_size_;
+  effective_num_elements = effective_num_elements_;
+  eps = eps_;
+  mean = allocator.allocate(sizeof(float) * effective_batch_size);
+  rstd = allocator.allocate(sizeof(float) * effective_batch_size);
+  ds= allocator.allocate(sizeof(float) * effective_batch_size);
+  db = allocator.allocate(sizeof(float) * effective_batch_size);
+  scale= allocator.allocate(sizeof(float) * effective_batch_size);
+  bias = allocator.allocate(sizeof(float) * effective_batch_size);
+  LayerNormPerDeviceState per_device_state = LayerNormPerDeviceState(handle,
+                                                                     elementwise_affine,
+                                                                     effective_batch_size,
+                                                                     effective_num_elements,
+                                                                      eps,
+                                                                      mean,
+                                                                      rstd,
+                                                                      ds,
+                                                                      db,
+                                                                      scale,
+                                                                      bias);
+    return per_device_state;
+
+  }
+
+
 template <DataType T>
 struct ForwardKernel {
   void operator()(cudaStream_t stream,
@@ -137,7 +170,7 @@ struct BackwardKernel {
 }
 
 void forward_kernel(cudaStream_t stream,
-                    LayerNormPerDeviceState const *m,
+                    LayerNormPerDeviceState const &m,
                     GenericTensorAccessorR const &input,
                     GenericTensorAccessorW const &output,
                     GenericTensorAccessorW const &gamma,
@@ -147,7 +180,7 @@ void forward_kernel(cudaStream_t stream,
 }
 
 void backward_kernel(cudaStream_t stream,
-                     LayerNormPerDeviceState const *m,
+                     LayerNormPerDeviceState const &m,
                      GenericTensorAccessorR const &output_grad,
                      GenericTensorAccessorR const &input,
                      GenericTensorAccessorW const &input_grad,
diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc
index 98aabb6fc5..2380a4ceb5 100644
--- a/lib/runtime/src/ops/layer_norm.cc
+++ b/lib/runtime/src/ops/layer_norm.cc
@@ -16,6 +16,8 @@
 #include "layer_norm.h"
 #include "kernels/layer_norm_kernels.h"
 #include "legion/legion_utilities.h"
+#include "op-attrs/ops/layer_norm.h"
+#include "utils/exception.decl.h"
 #include "utils/hash-utils.h"
 
 namespace FlexFlow {
@@ -578,4 +580,71 @@ Op *LayerNorm::materialize(FFModel &ff,
       ff, params, inputs[0], this->name, true /*allocate_weights*/);
 }
 
+enum Slots {INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE };
+
+OpTaskInvocation init(LayerNormAttrs const & attrs) {
+  OpTaskBinding b;
+
+  b.bind_arg(HANDLE, ff_handle());
+  b.bind_arg(ATTRS, attrs);
+
+  return {LAYERNORM_INIT_TASK_ID, b};
+}
+
+static DeviceSpecific<LayerNormPerDeviceState> init_task_impl(TaskArgumentAccessor const &acc) {
+  auto const &attrs = acc.get_argument<MultiHeadAttentionAttrs>(ATTRS);
+  Allocator allocator = acc.get_allocator();
+  FFHandler handle = acc.get_argument<FFHandler>(HANDLE);
+  //question: how to get batch_size and effective_num_elements
+  int64_t effective_batch_size, effective_num_elements;
+
+  DeviceSpecific<LayerNormPerDeviceState> per_device_state = 
+      acc.create_device_specific<LayerNormPerDeviceState>(
+        init_kernel(handle,
+                    allocator,
+                    attrs.elementwise_affine,
+                    effective_batch_size,
+                    effective_num_elements,
+                    attrs.eps)
+      );
+}
+
+static DeviceSpecific<LayerNormPerDeviceState>  init_task(Task const *task,
+              std::vector<PhysicalRegion> const &regions,
+              Context ctx,
+              Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  return init_task_impl(acc);
+}
+
+CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
+                                  LayerNormAttrs const & attrs,
+                                  ParallelTensorShape const &input_shape,
+                                  ProfilingSettings const &settings,
+                                  MachineView const &machine_view) {
+    auto env = sim.new_environment(); 
+    ParallelTensorShape output_shape =get_output_shape(attrs, input_shape);
+
+    SimTaskBinding init_binding;
+    init_binding.bind_arg(HANDLE, ff_handle());
+    init_binding.bind_arg(ATTRS, attrs);
+
+    auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding);
+
+     DeviceSpecific<LayerNormPerDeviceState> = init_task_impl(init_accessor);
+
+}
+
+template <>
+void register_task<LAYERNORM_INIT_TASK_ID>() {
+  OpTaskSignature init(OpTaskType::INIT);
+  init.add_arg_slot<LayerNormAttrs>(ATTRS); 
+  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+
+  init.add_return_value<LayerNormPerDeviceState>();
+
+  register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init, init_task);
+}
+
+
 }; // namespace FlexFlow

From 51c351b0031d88e025ee73b45c7d5f09a9d17d1d Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 19:29:06 +0000
Subject: [PATCH 2/9] implement the LayerNorm forward and backward tasks

---
 lib/runtime/src/ops/layer_norm.cc | 110 +++++++++++++++++++++++++++++-
 1 file changed, 109 insertions(+), 1 deletion(-)

diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc
index 2380a4ceb5..30ccd60a7e 100644
--- a/lib/runtime/src/ops/layer_norm.cc
+++ b/lib/runtime/src/ops/layer_norm.cc
@@ -19,6 +19,7 @@
 #include "op-attrs/ops/layer_norm.h"
 #include "utils/exception.decl.h"
 #include "utils/hash-utils.h"
+#include <type_traits>
 
 namespace FlexFlow {
 
@@ -591,6 +592,86 @@ OpTaskInvocation init(LayerNormAttrs const & attrs) {
   return {LAYERNORM_INIT_TASK_ID, b};
 }
 
+OpTaskInvocation forward(LayerNormAttrs const & attrs) {
+  OpTaskBinding b;
+
+  b.bind(INPUT, input_tensor(0));
+  b.bind(OUTPUT, output_tensor(0));
+  b.bind(GAMMA, weight_tensor(0));//todo, this may have some problem
+  b.bind(BETA, weight_tensor(1));//how to get gmmam and beta
+  b.bind_arg(PROFILING, profiling_settings());
+  b.bind_arg(PER_DEVICE_STATE, per_device_state<LayerNormPerDeviceState>());
+
+  return {LAYERNORM_FWD_TASK_ID, b};
+}
+
+OpTaskInvocation backward(LayerNormAttrs const & attrs) {
+  OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);
+
+  return {LAYERNORM_BWD_TASK_ID, b};
+}
+
+
+static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
+    auto input = acc.get_tensor<Permission::RO>(INPUT);
+    auto output = acc.get_tensor<Permission::WO>(OUTPUT);
+    auto gamma = acc.get_tensor<Permission::WO>(GAMMA);
+    auto beta = acc.get_tensor<Permission::WO>(BETA);
+
+    ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+    auto &state = acc.get_argument<LayerNormPerDeviceState>(PER_DEVICE_STATE);
+
+    return profile(forward_kernel,
+                  profiling,
+                  "[LayerNorm] forward time = %.2lfms\n",
+                  state,
+                  input.get_float_ptr(),
+                  output.get_float_ptr(),
+                  gamma.get_float_ptr(),
+                  beta.get_float_ptr());
+}
+
+static void forward_task(Task const *task,
+                         std::vector<PhysicalRegion> const &regions,
+                         Context ctx,
+                         Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  forward_task_impl(acc);
+}
+
+
+static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
+  auto input = acc.get_tensor<Permission::RO>(INPUT);
+  auto gamma = acc.get_tensor<Permission::RO>(GAMMA);
+
+  auto input_grad = acc.get_tensor<Permission::RW>(INPUT_GRAD);
+  auto gamma_grad = acc.get_tensor<Permission::RW>(GAMMA_GRAD);
+  auto beta_grad = acc.get_tensor<Permission::RW>(BETA_GRAD);
+  auto output_grad = acc.get_tensor<Permission::RO>(OUTPUT_GRAD);
+
+  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  auto &state = acc.get_argument<LayerNormPerDeviceState>(PER_DEVICE_STATE);
+
+  return profile(backward_kernel,
+                  profiling,
+                  "[LayerNorm] backward time = %.2lfms\n",
+                  state,
+                  output_grad.get_float_ptr(),
+                  input.get_float_ptr(),
+                  input_grad.get_float_ptr(),
+                  gamma.get_float_ptr(),
+                  gamma_grad.get_float_ptr(),
+                  beta_grad.get_float_ptr());
+}
+
+static void backward_task(Task const *task,
+                          std::vector<PhysicalRegion> const &regions,
+                          Context ctx,
+                          Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  backward_task_impl(acc);
+}
+
 static DeviceSpecific<LayerNormPerDeviceState> init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<MultiHeadAttentionAttrs>(ATTRS);
   Allocator allocator = acc.get_allocator();
@@ -631,7 +712,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
 
     auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding);
 
-     DeviceSpecific<LayerNormPerDeviceState> = init_task_impl(init_accessor);
+    DeviceSpecific<LayerNormPerDeviceState> = init_task_impl(init_accessor);
+
+    
 
 }
 
@@ -646,5 +729,30 @@ void register_task<LAYERNORM_INIT_TASK_ID>() {
   register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init, init_task);
 }
 
+template <>
+void register_task<LAYERNORM_FWD_TASK_ID>() {
+  OpTaskSignature fwd(OpTaskType::FWD);
+
+  fwd.add_input_slot(INPUT);
+  fwd.add_output_slot(OUTPUT);
+  //how to hande gamma and beta, this may have some problem
+  fwd.add_input_slot(GAMMA);
+  fwd.add_input_slot(BETA);
+
+  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_unchecked_arg_slot<LayerNormPerDeviceState>(PER_DEVICE_STATE);
+
+  register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd, forward_task);
+}
+
+template <>
+void register_task<LAYERNORM_BWD_TASK_ID>() {
+  OpTaskSignature bwd =
+      infer_bwd_signature(get_op_signature(LAYERNORM_FWD_TASK_ID));
+
+  register_task(LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd, backward_task); 
+}
+
+
 
 }; // namespace FlexFlow

From bbe08aa4dc4ac1d0915d8bc7ed9d4192ff2d8939 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 19:31:02 +0000
Subject: [PATCH 3/9] layer norm version 0.1

---
 lib/runtime/src/ops/layer_norm.cc | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc
index 30ccd60a7e..335d542ffc 100644
--- a/lib/runtime/src/ops/layer_norm.cc
+++ b/lib/runtime/src/ops/layer_norm.cc
@@ -714,8 +714,21 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
 
     DeviceSpecific<LayerNormPerDeviceState> = init_task_impl(init_accessor);
 
-    
+    SimTaskBinding fwd_binding;
+    fwd_binding.bind(INPUT, input_shape);
+    fwd_binding.bind(OUTPUT, output_shape);
+    //TODO how to handle gamma and beta, where are they from
 
+    SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
+
+    auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding);
+    auto bwd_accessor = env.get_bwd_accessor(LAYERNORM_BWD_TASK_ID, bwd_binding);
+
+    float forward_time = forward_task_impl(fwd_accessor).value();
+    float backward_time = backward_task_impl(bwd_accessor).value();
+
+    float sync_time = default_estimate_sync_time(env);
+    return make_metrics(forward_time, backward_time, sync_time, env);
 }
 
 template <>

From 1e52a7748d80419d01a40488ceedeab81a607cff Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Mon, 9 Oct 2023 19:33:07 +0000
Subject: [PATCH 4/9] layer norm draft

---
 .../include/kernels/layer_norm_kernels.h      |  26 +-
 lib/kernels/src/cuda/layer_norm_kernels.cu    |  47 +-
 lib/runtime/src/ops/layer_norm.cc             | 695 ++----------------
 3 files changed, 103 insertions(+), 665 deletions(-)

diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h
index cd07a6878c..3a998a74a5 100644
--- a/lib/kernels/include/kernels/layer_norm_kernels.h
+++ b/lib/kernels/include/kernels/layer_norm_kernels.h
@@ -1,8 +1,8 @@
 #ifndef _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H
 #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H
 
-#include "kernels/device.h"
 #include "kernels/allocation.h"
+#include "kernels/device.h"
 #include "kernels/ff_handle.h"
 
 namespace FlexFlow {
@@ -26,11 +26,11 @@ class LayerNormPerDeviceState : public PerDeviceOpState {
 };
 
 struct LayerNormPerDeviceState {
-    bool elementwise_affine;
-    int64_t effective_batch_size, effective_num_elements;
-    float eps;
-    float *mean, *rstd, *ds, *db, *scale, *bias;
-    DataType data_type;
+  bool elementwise_affine;
+  int64_t effective_batch_size, effective_num_elements;
+  float eps;
+  float *mean, *rstd, *ds, *db, *scale, *bias;
+  DataType data_type;
 };
 
 FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState,
@@ -46,18 +46,16 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState,
                                              bias,
                                              data_type);
 
-
 namespace Kernels {
 namespace LayerNorm {
 
-//todo: this may have some problem.
+// todo: this may have some problem.
 LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &,
-                              Allocator const &,
-                              bool elementwise_affine,
-                              int64_t effective_batch_size,
-                              int64_t effective_num_elements,
-                              float eps);
-
+                                    Allocator const &,
+                                    bool elementwise_affine,
+                                    int64_t effective_batch_size,
+                                    int64_t effective_num_elements,
+                                    float eps);
 
 void forward_kernel(ffStream_t stream,
                     LayerNormPerDeviceState const &m,
diff --git a/lib/kernels/src/cuda/layer_norm_kernels.cu b/lib/kernels/src/cuda/layer_norm_kernels.cu
index 7dc447d511..f8331abe29 100644
--- a/lib/kernels/src/cuda/layer_norm_kernels.cu
+++ b/lib/kernels/src/cuda/layer_norm_kernels.cu
@@ -48,38 +48,37 @@ LayerNormPerDeviceState::LayerNormPerDeviceState(
 namespace Kernels {
 namespace LayerNorm {
 
-//todo: this may have some problem.
-LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const & handle,
-                              Allocator const & allocator,
-                              bool elementwise_affine_,
-                              int64_t effective_batch_size_,
-                              int64_t effective_num_elements_,
-                              float eps_) {
+// todo: this may have some problem.
+LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
+                                    Allocator const &allocator,
+                                    bool elementwise_affine_,
+                                    int64_t effective_batch_size_,
+                                    int64_t effective_num_elements_,
+                                    float eps_) {
   elementwise_affine = elementwise_affine_;
   effective_batch_size = effective_batch_size_;
   effective_num_elements = effective_num_elements_;
   eps = eps_;
   mean = allocator.allocate(sizeof(float) * effective_batch_size);
   rstd = allocator.allocate(sizeof(float) * effective_batch_size);
-  ds= allocator.allocate(sizeof(float) * effective_batch_size);
+  ds = allocator.allocate(sizeof(float) * effective_batch_size);
   db = allocator.allocate(sizeof(float) * effective_batch_size);
-  scale= allocator.allocate(sizeof(float) * effective_batch_size);
+  scale = allocator.allocate(sizeof(float) * effective_batch_size);
   bias = allocator.allocate(sizeof(float) * effective_batch_size);
-  LayerNormPerDeviceState per_device_state = LayerNormPerDeviceState(handle,
-                                                                     elementwise_affine,
-                                                                     effective_batch_size,
-                                                                     effective_num_elements,
-                                                                      eps,
-                                                                      mean,
-                                                                      rstd,
-                                                                      ds,
-                                                                      db,
-                                                                      scale,
-                                                                      bias);
-    return per_device_state;
-
-  }
-
+  LayerNormPerDeviceState per_device_state =
+      LayerNormPerDeviceState(handle,
+                              elementwise_affine,
+                              effective_batch_size,
+                              effective_num_elements,
+                              eps,
+                              mean,
+                              rstd,
+                              ds,
+                              db,
+                              scale,
+                              bias);
+  return per_device_state;
+}
 
 template <DataType T>
 struct ForwardKernel {
diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc
index 335d542ffc..7757f726e3 100644
--- a/lib/runtime/src/ops/layer_norm.cc
+++ b/lib/runtime/src/ops/layer_norm.cc
@@ -17,573 +17,15 @@
 #include "kernels/layer_norm_kernels.h"
 #include "legion/legion_utilities.h"
 #include "op-attrs/ops/layer_norm.h"
-#include "utils/exception.decl.h"
+#include "utils/exceptions.h"
 #include "utils/hash-utils.h"
 #include <type_traits>
 
 namespace FlexFlow {
 
-// declare Legion names
-using Legion::ArgumentMap;
-using Legion::Context;
-using Legion::coord_t;
-using Legion::Domain;
-using Legion::FutureMap;
-using Legion::IndexLauncher;
-using Legion::InlineLauncher;
-using Legion::Machine;
-using Legion::Memory;
-using Legion::PhysicalRegion;
-using Legion::Predicate;
-using Legion::Rect;
-using Legion::RegionRequirement;
-using Legion::Runtime;
-using Legion::Task;
-using Legion::TaskArgument;
-using Legion::TaskLauncher;
-
-using namespace FlexFlow::Kernels::LayerNorm;
-
-LayerNormParams LayerNorm::get_params() const {
-  LayerNormParams params;
-  params.layer_guid = this->layer_guid;
-  params.axes = this->axes;
-  params.elementwise_affine = this->elementwise_affine;
-  params.eps = this->eps;
-  return params;
-}
-
-Tensor FFModel::layer_norm(const Tensor input,
-                           std::vector<int> const &axes,
-                           bool elementwise_affine,
-                           float eps,
-                           char const *name) {
-  // FIXME: currently disable elementwise_affine
-  elementwise_affine = false;
-  // axes must be the last axes.size() dimensions
-  for (int i = 0; i < axes.size(); i++) {
-    bool found = false;
-    for (int j = 0; j < axes.size(); j++) {
-      if (axes[j] == input->num_dims - 1 - i) {
-        found = true;
-      }
-    }
-    if (!found) {
-      assert(false && "axes must be the last axes.size() dimensions");
-    }
-  }
-  int num_weights = elementwise_affine ? 2 : 0;
-  Layer *ln = new Layer(this,
-                        OP_LAYERNORM,
-                        DT_FLOAT,
-                        name,
-                        1 /*inputs*/,
-                        num_weights,
-                        1 /*outputs*/,
-                        input);
-  ln->outputs[0] = create_tensor_legion_ordering(input->num_dims,
-                                                 input->dims,
-                                                 input->data_type,
-                                                 ln,
-                                                 0,
-                                                 true /*create_grad*/);
-  if (num_weights == 2) {
-    int M = 1;
-    for (int i = 0; i < axes.size(); i++) {
-      M *= input->dims[input->num_dims - 1 - axes[i]];
-    }
-    int dims[1] = {M};
-    ln->weights[0] = create_weight_legion_ordering(1,
-                                                   dims,
-                                                   input->data_type,
-                                                   ln,
-                                                   true /*create_grad*/,
-                                                   nullptr,
-                                                   CHOSEN_SYNC_TYPE);
-    ln->weights[1] = create_weight_legion_ordering(1,
-                                                   dims,
-                                                   input->data_type,
-                                                   ln,
-                                                   true /*create_grad*/,
-                                                   nullptr,
-                                                   CHOSEN_SYNC_TYPE);
-  }
-  ln->add_int_property("elementwise_affine", elementwise_affine);
-  ln->add_int_vector_property("axes", axes);
-  ln->add_float_property("eps", eps);
-  layers.push_back(ln);
-  return ln->outputs[0];
-}
-
-Op *LayerNorm::create_operator_from_layer(
-    FFModel &model,
-    Layer const *layer,
-    std::vector<ParallelTensor> const &inputs) {
-  long long value;
-  layer->get_int_property("elementwise_affine", value);
-  bool elementwise_affine = (bool)value;
-  std::vector<int> axes;
-  layer->get_int_vector_property("axes", axes);
-  float eps;
-  layer->get_float_property("eps", eps);
-  return new LayerNorm(model,
-                       layer->layer_guid,
-                       inputs[0],
-                       axes,
-                       elementwise_affine,
-                       eps,
-                       false, // allocate_weights
-                       layer->name);
-}
-
-LayerNorm::LayerNorm(FFModel &model,
-                     LayerNormParams const &params,
-                     ParallelTensor const input,
-                     char const *name,
-                     bool allocate_weights)
-    : LayerNorm(model,
-                params.layer_guid,
-                input,
-                params.axes,
-                params.elementwise_affine,
-                params.eps,
-                allocate_weights,
-                name) {}
-
-LayerNorm::LayerNorm(FFModel &model,
-                     LayerID const &_layer_guid,
-                     const ParallelTensor _input,
-                     std::vector<int> const &_axes,
-                     bool _elementwise_affine,
-                     float _eps,
-                     bool allocate_weights,
-                     char const *name)
-    : Op(model,
-         OP_LAYERNORM,
-         _input->data_type,
-         name,
-         1 /*inputs*/,
-         _elementwise_affine ? 2 : 0 /*weights*/,
-         1 /*outputs*/,
-         _input),
-      elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes) {
-  // overwrite layer_guid
-  layer_guid = _layer_guid;
-  outputs[0] = model.create_parallel_tensor_legion_ordering(
-      _input->num_dims, _input->dims, _input->data_type, this);
-  assert(check_output_input_weight_parallel_dims(allocate_weights));
-  ParallelDim output_dims[MAX_TENSOR_DIM];
-  int M = 1;
-  for (int i = 0; i < axes.size(); i++) {
-    M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size;
-  }
-  effective_num_elements = M;
-  effective_batch_size = inputs[0]->get_volume() / M;
-  if (numWeights > 0 && allocate_weights) {
-    int kernel_dims = 2;
-    assert(false);
-    // weights[0] = model.create_parallel_weight_legion_ordering(
-    //     kernel_dims,
-  } else {
-    // do nothing
-  }
-  return;
-}
-
-void LayerNorm::init(FFModel const &ff) {
-  assert(check_output_input_weight_same_parallel_is());
-  parallel_is = outputs[0]->parallel_is;
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_init(ff, argmap);
-  IndexLauncher launcher(LAYERNORM_INIT_TASK_ID,
-                         parallel_is,
-                         TaskArgument(this, sizeof(LayerNorm)),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  FutureMap fm = runtime->execute_index_space(ctx, launcher);
-  fm.wait_all_results();
-  set_opmeta_from_futuremap(ff, fm);
-}
-
-PerDeviceOpState *
-    LayerNorm::init_task(Task const *task,
-                         std::vector<PhysicalRegion> const &regions,
-                         Context ctx,
-                         Runtime *runtime) {
-  LayerNorm *ln = (LayerNorm *)task->args;
-  FFHandler handle = *((FFHandler const *)task->local_args);
-  LayerNormMeta *meta = new LayerNormMeta(handle, ln);
-  return meta;
-}
-
-void LayerNorm::forward(FFModel const &ff) {
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_forward(ff, argmap);
-  IndexLauncher launcher(LAYERNORM_FWD_TASK_ID,
-                         parallel_is,
-                         TaskArgument(NULL, 0),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  if (elementwise_affine) {
-    launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                      0 /*projection id*/,
-                                                      READ_WRITE,
-                                                      EXCLUSIVE,
-                                                      weights[0]->region));
-    launcher.add_field(2, FID_DATA);
-    launcher.add_region_requirement(RegionRequirement(weights[1]->part,
-                                                      0 /*projection id*/,
-                                                      READ_WRITE,
-                                                      EXCLUSIVE,
-                                                      weights[1]->region));
-    launcher.add_field(3, FID_DATA);
-  }
-  runtime->execute_index_space(ctx, launcher);
-}
-
-/*
-  regions[0](I): input
-  regions[1](O): output
-  regions[2](I/O): gamma
-  regions[3](I/O): beta
-*/
-void LayerNorm::forward_task(Task const *task,
-                             std::vector<PhysicalRegion> const &regions,
-                             Context ctx,
-                             Runtime *runtime) {
-  LayerNormMeta const *m = *((LayerNormMeta **)task->local_args);
-  assert(task->regions.size() == regions.size());
-  float const *in_ptr = NULL;
-  float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL;
-  Domain in_domain = runtime->get_index_space_domain(
-      ctx, task->regions[0].region.get_index_space());
-  in_ptr = helperGetTensorPointerRO<float>(
-      regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  Domain out_domain = runtime->get_index_space_domain(
-      ctx, task->regions[1].region.get_index_space());
-  out_ptr = helperGetTensorPointerWO<float>(
-      regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  assert(in_domain == out_domain);
-  assert(in_domain.get_volume() ==
-         m->effective_num_elements * m->effective_batch_size);
-  if (m->elementwise_affine) {
-    assert(regions.size() == 4);
-    Domain gamma_domain = runtime->get_index_space_domain(
-        ctx, task->regions[2].region.get_index_space());
-    gamma_ptr = helperGetTensorPointerRW<float>(
-        regions[2], task->regions[2], FID_DATA, ctx, runtime);
-    Domain beta_domain = runtime->get_index_space_domain(
-        ctx, task->regions[3].region.get_index_space());
-    beta_ptr = helperGetTensorPointerRW<float>(
-        regions[3], task->regions[3], FID_DATA, ctx, runtime);
-    assert(gamma_domain == beta_domain);
-    assert(gamma_domain.get_volume() == m->effective_num_elements);
-  } else {
-    assert(regions.size() == 2);
-  }
-
-  forward_kernel_wrapper<float>(m, in_ptr, out_ptr, gamma_ptr, beta_ptr);
-}
-
-void LayerNorm::backward(FFModel const &ff) {
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_backward(ff, argmap);
-  IndexLauncher launcher(LAYERNORM_BWD_TASK_ID,
-                         parallel_is,
-                         TaskArgument(NULL, 0),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  // regions[0](I): output_grad
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region_grad));
-  launcher.add_field(0, FID_DATA);
-  // regions[1](I): input
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  // regions[2](I/O): input_grad
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    READ_WRITE,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region_grad));
-  launcher.add_field(2, FID_DATA);
-  if (elementwise_affine) {
-    // regions[3](I): gamma
-    launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      weights[0]->region));
-    launcher.add_field(3, FID_DATA);
-    // regions[4](I/O): gamma_grad
-    launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad,
-                                                      0 /*projection id*/,
-                                                      READ_WRITE,
-                                                      EXCLUSIVE,
-                                                      weights[0]->region_grad));
-    launcher.add_field(4, FID_DATA);
-    // regions[5](I/O): beta_grad
-    launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad,
-                                                      0 /*projection id*/,
-                                                      READ_WRITE,
-                                                      EXCLUSIVE,
-                                                      weights[1]->region_grad));
-    launcher.add_field(5, FID_DATA);
-  }
-  runtime->execute_index_space(ctx, launcher);
-}
+enum Slots { INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE };
 
-/*
-  regions[0](I): output_grad
-  regions[1](I): input
-  regions[2](I/O): input_grad
-  regions[3](I): gamma
-  regions[4](I/O): gamma_grad
-  regions[5](I/O): beta_grad
-   */
-void LayerNorm::backward_task(Task const *task,
-                              std::vector<PhysicalRegion> const &regions,
-                              Context ctx,
-                              Runtime *runtime) {
-  LayerNormMeta const *m = *((LayerNormMeta **)task->local_args);
-  assert(task->regions.size() == regions.size());
-  float const *in_ptr = NULL, *out_grad_ptr = NULL, *gamma_ptr = NULL;
-  float *in_grad_ptr = NULL, *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL;
-  Domain out_grad_domain = runtime->get_index_space_domain(
-      ctx, task->regions[0].region.get_index_space());
-  out_grad_ptr = helperGetTensorPointerRO<float>(
-      regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  Domain in_domain = runtime->get_index_space_domain(
-      ctx, task->regions[1].region.get_index_space());
-  in_ptr = helperGetTensorPointerRO<float>(
-      regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  Domain in_grad_domain = runtime->get_index_space_domain(
-      ctx, task->regions[2].region.get_index_space());
-  in_grad_ptr = helperGetTensorPointerRW<float>(
-      regions[2], task->regions[2], FID_DATA, ctx, runtime);
-  assert(in_domain == out_grad_domain);
-  assert(in_domain.get_volume() ==
-         m->effective_num_elements * m->effective_batch_size);
-  if (m->elementwise_affine) {
-    assert(regions.size() == 6);
-    Domain gamma_domain = runtime->get_index_space_domain(
-        ctx, task->regions[3].region.get_index_space());
-    gamma_ptr = helperGetTensorPointerRO<float>(
-        regions[3], task->regions[3], FID_DATA, ctx, runtime);
-    Domain gamma_grad_domain = runtime->get_index_space_domain(
-        ctx, task->regions[4].region.get_index_space());
-    gamma_grad_ptr = helperGetTensorPointerRW<float>(
-        regions[4], task->regions[4], FID_DATA, ctx, runtime);
-    Domain beta_grad_domain = runtime->get_index_space_domain(
-        ctx, task->regions[5].region.get_index_space());
-    beta_grad_ptr = helperGetTensorPointerRW<float>(
-        regions[5], task->regions[5], FID_DATA, ctx, runtime);
-    assert(gamma_domain == gamma_grad_domain);
-    assert(gamma_domain == beta_grad_domain);
-    assert(gamma_domain.get_volume() == m->effective_num_elements);
-  } else {
-    assert(regions.size() == 3);
-  }
-
-  backward_kernel_wrapper<float>(m,
-                                 out_grad_ptr,
-                                 in_ptr,
-                                 in_grad_ptr,
-                                 gamma_ptr,
-                                 gamma_grad_ptr,
-                                 beta_grad_ptr);
-}
-
-bool LayerNorm::measure_operator_cost(Simulator *sim,
-                                      MachineView const &mv,
-                                      CostMetrics &cost_metrics) const {
-  ParallelTensorBase sub_output, sub_input;
-  if (!outputs[0]->get_sub_tensor(mv, sub_output)) {
-    return false;
-  }
-  if (!inputs[0]->get_sub_tensor(mv, sub_input)) {
-    return false;
-  }
-  LayerNormMeta *m = new LayerNormMeta(sim->handler, this);
-
-  sim->free_all();
-  float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-  assert(in_ptr != NULL);
-  cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-
-  float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-  assert(out_ptr != NULL);
-  cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-
-  // FIXME please add gamma_ptr and beta_ptr after finish the implementation
-  float *gamma_ptr = NULL, *beta_ptr = NULL;
-
-  bool out_of_memory =
-      (in_ptr == NULL) || (out_ptr == NULL) ||
-      (((gamma_ptr == NULL) || (beta_ptr == NULL)) && (m->elementwise_affine));
-  if (out_of_memory) {
-    cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME;
-    cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME;
-    return true;
-  }
-
-  std::function<void()> forward, backward;
-  forward = [&] {
-    forward_kernel_wrapper(m, in_ptr, out_ptr, gamma_ptr, beta_ptr);
-  };
-
-  if (sim->computationMode == COMP_MODE_TRAINING) {
-    float *in_grad_ptr =
-        (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-    assert(in_grad_ptr != NULL);
-    cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-
-    float *out_grad_ptr = NULL;
-    out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-    assert(out_grad_ptr != NULL);
-    cost_metrics.outputs_memory +=
-        cost_metrics.total_mem_diff_from(sim->offset);
-
-    float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL;
-
-    out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) ||
-                    (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) &&
-                     (m->elementwise_affine));
-    if (out_of_memory) {
-      cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME;
-      cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME;
-      return true;
-    }
-
-    backward = [&] {
-      backward_kernel_wrapper<float>(m,
-                                     out_grad_ptr,
-                                     in_ptr,
-                                     in_grad_ptr,
-                                     gamma_ptr,
-                                     gamma_grad_ptr,
-                                     beta_grad_ptr);
-    };
-  }
-
-  inner_measure_operator_cost(sim, forward, backward, cost_metrics);
-
-  if (sim->computationMode == COMP_MODE_TRAINING) {
-    log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) "
-                      "forward_time(%.4lf) backward_time(%.4lf)\n",
-                      name,
-                      sub_output.get_volume(),
-                      cost_metrics.forward_time,
-                      cost_metrics.backward_time);
-  } else {
-    log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) "
-                      "forward_time(%.4lf)\n",
-                      name,
-                      sub_output.get_volume(),
-                      cost_metrics.forward_time);
-  }
-
-  return true;
-}
-
-void LayerNorm::serialize(Legion::Serializer &sez) const {
-  sez.serialize(this->layer_guid.id);
-  sez.serialize(this->axes.size());
-  for (size_t i = 0; i < this->axes.size(); i++) {
-    sez.serialize(this->axes[i]);
-  }
-  sez.serialize(this->elementwise_affine);
-  sez.serialize(this->eps);
-}
-
-using PCG::Node;
-/*static*/
-Node LayerNorm::deserialize(FFModel &ff,
-                            Legion::Deserializer &dez,
-                            ParallelTensor inputs[],
-                            int num_inputs) {
-  assert(num_inputs == 1);
-  size_t num_axes;
-  std::vector<int> axes;
-  bool elementwise_affine;
-  float eps;
-  size_t id;
-  dez.deserialize(id);
-  LayerID layer_guid(id);
-  dez.deserialize(num_axes);
-  for (size_t i = 0; i < num_axes; i++) {
-    int axis_idx;
-    dez.deserialize(axis_idx);
-    axes.push_back(axis_idx);
-  }
-  dez.deserialize(elementwise_affine);
-  dez.deserialize(eps);
-
-  LayerNormParams params;
-  params.layer_guid = layer_guid;
-  params.axes = axes;
-  params.elementwise_affine = elementwise_affine;
-  params.eps = eps;
-  return ff.get_or_create_node<LayerNorm>(inputs[0], params);
-}
-
-Op *LayerNorm::materialize(FFModel &ff,
-                           ParallelTensor inputs[],
-                           int num_inputs) const {
-  LayerNormParams params = get_params();
-  return new LayerNorm(
-      ff, params, inputs[0], this->name, true /*allocate_weights*/);
-}
-
-enum Slots {INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE };
-
-OpTaskInvocation init(LayerNormAttrs const & attrs) {
+OpTaskInvocation init(LayerNormAttrs const &attrs) {
   OpTaskBinding b;
 
   b.bind_arg(HANDLE, ff_handle());
@@ -592,43 +34,42 @@ OpTaskInvocation init(LayerNormAttrs const & attrs) {
   return {LAYERNORM_INIT_TASK_ID, b};
 }
 
-OpTaskInvocation forward(LayerNormAttrs const & attrs) {
+OpTaskInvocation forward(LayerNormAttrs const &attrs) {
   OpTaskBinding b;
 
   b.bind(INPUT, input_tensor(0));
   b.bind(OUTPUT, output_tensor(0));
-  b.bind(GAMMA, weight_tensor(0));//todo, this may have some problem
-  b.bind(BETA, weight_tensor(1));//how to get gmmam and beta
+  b.bind(GAMMA, weight_tensor(0)); // todo, this may have some problem
+  b.bind(BETA, weight_tensor(1));  // how to get gmmam and beta
   b.bind_arg(PROFILING, profiling_settings());
   b.bind_arg(PER_DEVICE_STATE, per_device_state<LayerNormPerDeviceState>());
 
   return {LAYERNORM_FWD_TASK_ID, b};
 }
 
-OpTaskInvocation backward(LayerNormAttrs const & attrs) {
+OpTaskInvocation backward(LayerNormAttrs const &attrs) {
   OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);
 
   return {LAYERNORM_BWD_TASK_ID, b};
 }
 
-
 static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-    auto input = acc.get_tensor<Permission::RO>(INPUT);
-    auto output = acc.get_tensor<Permission::WO>(OUTPUT);
-    auto gamma = acc.get_tensor<Permission::WO>(GAMMA);
-    auto beta = acc.get_tensor<Permission::WO>(BETA);
-
-    ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-    auto &state = acc.get_argument<LayerNormPerDeviceState>(PER_DEVICE_STATE);
-
-    return profile(forward_kernel,
-                  profiling,
-                  "[LayerNorm] forward time = %.2lfms\n",
-                  state,
-                  input.get_float_ptr(),
-                  output.get_float_ptr(),
-                  gamma.get_float_ptr(),
-                  beta.get_float_ptr());
+  auto input = acc.get_tensor<Permission::RO>(INPUT);
+  auto output = acc.get_tensor<Permission::WO>(OUTPUT);
+  auto gamma = acc.get_tensor<Permission::WO>(GAMMA);
+  auto beta = acc.get_tensor<Permission::WO>(BETA);
+
+  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  auto &state = acc.get_argument<LayerNormPerDeviceState>(PER_DEVICE_STATE);
+
+  return profile(forward_kernel,
+                 profiling,
+                 "[LayerNorm] forward time = %.2lfms\n",
+                 state,
+                 input.get_float_ptr(),
+                 output.get_float_ptr(),
+                 gamma.get_float_ptr(),
+                 beta.get_float_ptr());
 }
 
 static void forward_task(Task const *task,
@@ -639,7 +80,6 @@ static void forward_task(Task const *task,
   forward_task_impl(acc);
 }
 
-
 static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
   auto input = acc.get_tensor<Permission::RO>(INPUT);
   auto gamma = acc.get_tensor<Permission::RO>(GAMMA);
@@ -653,15 +93,15 @@ static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
   auto &state = acc.get_argument<LayerNormPerDeviceState>(PER_DEVICE_STATE);
 
   return profile(backward_kernel,
-                  profiling,
-                  "[LayerNorm] backward time = %.2lfms\n",
-                  state,
-                  output_grad.get_float_ptr(),
-                  input.get_float_ptr(),
-                  input_grad.get_float_ptr(),
-                  gamma.get_float_ptr(),
-                  gamma_grad.get_float_ptr(),
-                  beta_grad.get_float_ptr());
+                 profiling,
+                 "[LayerNorm] backward time = %.2lfms\n",
+                 state,
+                 output_grad.get_float_ptr(),
+                 input.get_float_ptr(),
+                 input_grad.get_float_ptr(),
+                 gamma.get_float_ptr(),
+                 gamma_grad.get_float_ptr(),
+                 beta_grad.get_float_ptr());
 }
 
 static void backward_task(Task const *task,
@@ -672,25 +112,26 @@ static void backward_task(Task const *task,
   backward_task_impl(acc);
 }
 
-static DeviceSpecific<LayerNormPerDeviceState> init_task_impl(TaskArgumentAccessor const &acc) {
+static DeviceSpecific<LayerNormPerDeviceState>
+    init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<MultiHeadAttentionAttrs>(ATTRS);
   Allocator allocator = acc.get_allocator();
   FFHandler handle = acc.get_argument<FFHandler>(HANDLE);
-  //question: how to get batch_size and effective_num_elements
+  // question: how to get batch_size and effective_num_elements
   int64_t effective_batch_size, effective_num_elements;
 
-  DeviceSpecific<LayerNormPerDeviceState> per_device_state = 
+  DeviceSpecific<LayerNormPerDeviceState> per_device_state =
       acc.create_device_specific<LayerNormPerDeviceState>(
-        init_kernel(handle,
-                    allocator,
-                    attrs.elementwise_affine,
-                    effective_batch_size,
-                    effective_num_elements,
-                    attrs.eps)
-      );
+          init_kernel(handle,
+                      allocator,
+                      attrs.elementwise_affine,
+                      effective_batch_size,
+                      effective_num_elements,
+                      attrs.eps));
 }
 
-static DeviceSpecific<LayerNormPerDeviceState>  init_task(Task const *task,
+static DeviceSpecific<LayerNormPerDeviceState>
+    init_task(Task const *task,
               std::vector<PhysicalRegion> const &regions,
               Context ctx,
               Runtime *runtime) {
@@ -699,42 +140,43 @@ static DeviceSpecific<LayerNormPerDeviceState>  init_task(Task const *task,
 }
 
 CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
-                                  LayerNormAttrs const & attrs,
+                                  LayerNormAttrs const &attrs,
                                   ParallelTensorShape const &input_shape,
                                   ProfilingSettings const &settings,
                                   MachineView const &machine_view) {
-    auto env = sim.new_environment(); 
-    ParallelTensorShape output_shape =get_output_shape(attrs, input_shape);
+  auto env = sim.new_environment();
+  ParallelTensorShape output_shape = get_output_shape(attrs, input_shape);
 
-    SimTaskBinding init_binding;
-    init_binding.bind_arg(HANDLE, ff_handle());
-    init_binding.bind_arg(ATTRS, attrs);
+  SimTaskBinding init_binding;
+  init_binding.bind_arg(HANDLE, ff_handle());
+  init_binding.bind_arg(ATTRS, attrs);
 
-    auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding);
+  auto init_accessor =
+      env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding);
 
-    DeviceSpecific<LayerNormPerDeviceState> = init_task_impl(init_accessor);
+  DeviceSpecific<LayerNormPerDeviceState> = init_task_impl(init_accessor);
 
-    SimTaskBinding fwd_binding;
-    fwd_binding.bind(INPUT, input_shape);
-    fwd_binding.bind(OUTPUT, output_shape);
-    //TODO how to handle gamma and beta, where are they from
+  SimTaskBinding fwd_binding;
+  fwd_binding.bind(INPUT, input_shape);
+  fwd_binding.bind(OUTPUT, output_shape);
+  // TODO how to handle gamma and beta, where are they from
 
-    SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
+  SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
 
-    auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding);
-    auto bwd_accessor = env.get_bwd_accessor(LAYERNORM_BWD_TASK_ID, bwd_binding);
+  auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding);
+  auto bwd_accessor = env.get_bwd_accessor(LAYERNORM_BWD_TASK_ID, bwd_binding);
 
-    float forward_time = forward_task_impl(fwd_accessor).value();
-    float backward_time = backward_task_impl(bwd_accessor).value();
+  float forward_time = forward_task_impl(fwd_accessor).value();
+  float backward_time = backward_task_impl(bwd_accessor).value();
 
-    float sync_time = default_estimate_sync_time(env);
-    return make_metrics(forward_time, backward_time, sync_time, env);
+  float sync_time = default_estimate_sync_time(env);
+  return make_metrics(forward_time, backward_time, sync_time, env);
 }
 
 template <>
 void register_task<LAYERNORM_INIT_TASK_ID>() {
   OpTaskSignature init(OpTaskType::INIT);
-  init.add_arg_slot<LayerNormAttrs>(ATTRS); 
+  init.add_arg_slot<LayerNormAttrs>(ATTRS);
   init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
 
   init.add_return_value<LayerNormPerDeviceState>();
@@ -748,7 +190,7 @@ void register_task<LAYERNORM_FWD_TASK_ID>() {
 
   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
-  //how to hande gamma and beta, this may have some problem
+  // todo how to hande gamma and beta, this may have some problem
   fwd.add_input_slot(GAMMA);
   fwd.add_input_slot(BETA);
 
@@ -763,9 +205,8 @@ void register_task<LAYERNORM_BWD_TASK_ID>() {
   OpTaskSignature bwd =
       infer_bwd_signature(get_op_signature(LAYERNORM_FWD_TASK_ID));
 
-  register_task(LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd, backward_task); 
+  register_task(
+      LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd, backward_task);
 }
 
-
-
 }; // namespace FlexFlow

From ea9bfa5607cbca09722e40aedf228d7b220cacce Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 11 Oct 2023 15:22:38 +0000
Subject: [PATCH 5/9] fix the layer norm and leave beta and gamma for later

---
 lib/runtime/src/ops/layer_norm.cc | 35 ++++++++++++++++++++++++++-----
 lib/runtime/src/ops/layer_norm.h  |  2 +-
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc
index 7757f726e3..129857080e 100644
--- a/lib/runtime/src/ops/layer_norm.cc
+++ b/lib/runtime/src/ops/layer_norm.cc
@@ -17,10 +17,16 @@
 #include "kernels/layer_norm_kernels.h"
 #include "legion/legion_utilities.h"
 #include "op-attrs/ops/layer_norm.h"
+#include "op-attrs/parallel_tensor_shape.h"
 #include "utils/exceptions.h"
 #include "utils/hash-utils.h"
 #include <type_traits>
 
+using Legion::Context;
+using Legion::PhysicalRegion;
+using Legion::Runtime;
+using Legion::Task;
+
 namespace FlexFlow {
 
 enum Slots { INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE };
@@ -28,6 +34,8 @@ enum Slots { INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE };
 OpTaskInvocation init(LayerNormAttrs const &attrs) {
   OpTaskBinding b;
 
+  b.bind(INPUT, input_tensor(0));
+
   b.bind_arg(HANDLE, ff_handle());
   b.bind_arg(ATTRS, attrs);
 
@@ -116,9 +124,21 @@ static DeviceSpecific<LayerNormPerDeviceState>
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<MultiHeadAttentionAttrs>(ATTRS);
   Allocator allocator = acc.get_allocator();
+  auto input = acc.get_tensor<Permission::RO>(INPUT);
   FFHandler handle = acc.get_argument<FFHandler>(HANDLE);
+
   // question: how to get batch_size and effective_num_elements
   int64_t effective_batch_size, effective_num_elements;
+  int M = 1;
+  for (int i = 0; i < attrs.axes.size(); i++) {
+    M *= input.shape.at(legion_dim_t(attrs.axes[i]));
+  }
+  int num_replicas = 1;
+  for (int i = 0; i < intput.shape.num_dims(); i++) {
+    num_replicas *= input.shape.at(legion_dim_t(i));
+  }
+  effective_num_elements = M;
+  effective_batch_size = input.shape.get_volume() / num_replicas / M;
 
   DeviceSpecific<LayerNormPerDeviceState> per_device_state =
       acc.create_device_specific<LayerNormPerDeviceState>(
@@ -141,15 +161,16 @@ static DeviceSpecific<LayerNormPerDeviceState>
 
 CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
                                   LayerNormAttrs const &attrs,
-                                  ParallelTensorShape const &input_shape,
+                                  InputParallelTensorDesc const &input,
                                   ProfilingSettings const &settings,
                                   MachineView const &machine_view) {
   auto env = sim.new_environment();
-  ParallelTensorShape output_shape = get_output_shape(attrs, input_shape);
+  ParallelTensorShape output_shape = get_output_shape(attrs, input.shape);
 
   SimTaskBinding init_binding;
   init_binding.bind_arg(HANDLE, ff_handle());
   init_binding.bind_arg(ATTRS, attrs);
+  init.binding.bind(INPUT, input.shape);
 
   auto init_accessor =
       env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding);
@@ -157,8 +178,11 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
   DeviceSpecific<LayerNormPerDeviceState> = init_task_impl(init_accessor);
 
   SimTaskBinding fwd_binding;
-  fwd_binding.bind(INPUT, input_shape);
+  fwd_binding.bind(INPUT, input.shape);
   fwd_binding.bind(OUTPUT, output_shape);
+  fwd_binding.bind_arg(PROFILING, settings);
+  fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state);
+
   // TODO how to handle gamma and beta, where are they from
 
   SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
@@ -176,6 +200,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
 template <>
 void register_task<LAYERNORM_INIT_TASK_ID>() {
   OpTaskSignature init(OpTaskType::INIT);
+  init.add_input_slot(INPUT);
   init.add_arg_slot<LayerNormAttrs>(ATTRS);
   init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
 
@@ -191,8 +216,8 @@ void register_task<LAYERNORM_FWD_TASK_ID>() {
   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   // todo how to hande gamma and beta, this may have some problem
-  fwd.add_input_slot(GAMMA);
-  fwd.add_input_slot(BETA);
+  fwd.add_weight_slot(GAMMA);
+  fwd.add_weight_slot(BETA);
 
   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
   fwd.add_unchecked_arg_slot<LayerNormPerDeviceState>(PER_DEVICE_STATE);
diff --git a/lib/runtime/src/ops/layer_norm.h b/lib/runtime/src/ops/layer_norm.h
index 3cae9e3eb7..a9b00ee4e3 100644
--- a/lib/runtime/src/ops/layer_norm.h
+++ b/lib/runtime/src/ops/layer_norm.h
@@ -20,7 +20,7 @@ OpTaskInvocation backward(LayerNormAttrs const &);
 
 CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
                                   LayerNormAttrs const &,
-                                  ParallelTensorShape const &input_shape,
+                                  InputParallelTensorDesc const &input_shape,
                                   ProfilingSettings const &settings,
                                   MachineView const &machine_view);
 

From 146095bde97b4b6ad264e1bafafdec2e9dc130b0 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar <forvirenra@gmail.com>
Date: Wed, 17 Jan 2024 13:47:27 -0800
Subject: [PATCH 6/9] Finish layer norm

---
 lib/runtime/src/ops/layer_norm.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc
index 129857080e..93fc9899a7 100644
--- a/lib/runtime/src/ops/layer_norm.cc
+++ b/lib/runtime/src/ops/layer_norm.cc
@@ -64,8 +64,8 @@ OpTaskInvocation backward(LayerNormAttrs const &attrs) {
 static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto input = acc.get_tensor<Permission::RO>(INPUT);
   auto output = acc.get_tensor<Permission::WO>(OUTPUT);
-  auto gamma = acc.get_tensor<Permission::WO>(GAMMA);
-  auto beta = acc.get_tensor<Permission::WO>(BETA);
+  auto gamma = acc.get_tensor<Permission::RW>(GAMMA);
+  auto beta = acc.get_tensor<Permission::RW>(BETA);
 
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
   auto &state = acc.get_argument<LayerNormPerDeviceState>(PER_DEVICE_STATE);
@@ -136,9 +136,8 @@ static DeviceSpecific<LayerNormPerDeviceState>
   int num_replicas = 1;
   for (int i = 0; i < intput.shape.num_dims(); i++) {
     num_replicas *= input.shape.at(legion_dim_t(i));
-  }
   effective_num_elements = M;
-  effective_batch_size = input.shape.get_volume() / num_replicas / M;
+  effective_batch_size = input.shape.get_volume() / M;
 
   DeviceSpecific<LayerNormPerDeviceState> per_device_state =
       acc.create_device_specific<LayerNormPerDeviceState>(
@@ -184,7 +183,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
   fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state);
 
   // TODO how to handle gamma and beta, where are they from
-
+fwd_binding.bind(GAMMA, input_shape);
+fwd_binding.bind(BETA, input_shape);
   SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
 
   auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding);

From 0de86209c799d70899e34ea6f8fecc78fcad9791 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 7 Feb 2024 08:55:20 -0500
Subject: [PATCH 7/9] refine the layernorm

---
 deps/fmt                          |  2 +-
 lib/runtime/src/ops/layer_norm.cc | 54 +++++++++++++++++++------------
 lib/runtime/src/ops/layer_norm.h  |  2 +-
 3 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/deps/fmt b/deps/fmt
index a33701196a..f5e54359df 160000
--- a/deps/fmt
+++ b/deps/fmt
@@ -1 +1 @@
-Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50
+Subproject commit f5e54359df4c26b6230fc61d38aa294581393084
diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc
index 93fc9899a7..26dc049d4d 100644
--- a/lib/runtime/src/ops/layer_norm.cc
+++ b/lib/runtime/src/ops/layer_norm.cc
@@ -29,7 +29,7 @@ using Legion::Task;
 
 namespace FlexFlow {
 
-enum Slots { INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE };
+enum Slots { PROFILING, INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE };
 
 OpTaskInvocation init(LayerNormAttrs const &attrs) {
   OpTaskBinding b;
@@ -148,6 +148,7 @@ static DeviceSpecific<LayerNormPerDeviceState>
                       effective_num_elements,
                       attrs.eps));
 }
+    }
 
 static DeviceSpecific<LayerNormPerDeviceState>
     init_task(Task const *task,
@@ -198,40 +199,53 @@ fwd_binding.bind(BETA, input_shape);
 }
 
 template <>
-void register_task<LAYERNORM_INIT_TASK_ID>() {
+OpTaskSignature fwd_signature<LAYERNORM_FWD_TASK_ID>() {
+  OpTaskSignature fwd(OpTaskType::FWD);
+
+  fwd.add_input_slot(INPUT);
+  fwd.add_output_slot(OUTPUT);
+  fwd.add_weight_slot(GAMMA);
+  fwd.add_weight_slot(BETA);
+
+  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_unchecked_arg_slot<LayerNormPerDeviceState>(PER_DEVICE_STATE);
+  return fwd;
+}
+
+
+template <>
+OpTaskSignature bwd_signature<AYERNORM_BWD_TASK_ID>()  {
+  OpTaskSignature bwd = infer_bwd_signature(fwd_signature<LAYERNORM_FWD_TASK_ID>());
+  return bwd;
+}
+
+template <>
+OpTaskSignature init_signatur<LAYERNORM_INIT_TASK_ID>()  {
   OpTaskSignature init(OpTaskType::INIT);
   init.add_input_slot(INPUT);
   init.add_arg_slot<LayerNormAttrs>(ATTRS);
   init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
 
   init.add_return_value<LayerNormPerDeviceState>();
-
-  register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init, init_task);
+  return init;
 }
 
 template <>
-void register_task<LAYERNORM_FWD_TASK_ID>() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_input_slot(INPUT);
-  fwd.add_output_slot(OUTPUT);
-  // todo how to hande gamma and beta, this may have some problem
-  fwd.add_weight_slot(GAMMA);
-  fwd.add_weight_slot(BETA);
+void register_task<LAYERNORM_INIT_TASK_ID>() {
 
-  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_unchecked_arg_slot<LayerNormPerDeviceState>(PER_DEVICE_STATE);
+  register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signatur<LAYERNORM_INIT_TASK_ID>(), init_task);
+}
 
-  register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd, forward_task);
+template <>
+void register_task<LAYERNORM_FWD_TASK_ID>() {
+  register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd_signature<LAYERNORM_FWD_TASK_ID>() , forward_task);
 }
 
 template <>
 void register_task<LAYERNORM_BWD_TASK_ID>() {
-  OpTaskSignature bwd =
-      infer_bwd_signature(get_op_signature(LAYERNORM_FWD_TASK_ID));
-
   register_task(
-      LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd, backward_task);
+      LAYERNORM_BWD_TASK_ID, "LayerNorm backward",  bwd_signatur<AYERNORM_BWD_TASK_ID>() , backward_task);
+}
+
 }
 
-}; // namespace FlexFlow
diff --git a/lib/runtime/src/ops/layer_norm.h b/lib/runtime/src/ops/layer_norm.h
index a9b00ee4e3..83e6733bf6 100644
--- a/lib/runtime/src/ops/layer_norm.h
+++ b/lib/runtime/src/ops/layer_norm.h
@@ -20,7 +20,7 @@ OpTaskInvocation backward(LayerNormAttrs const &);
 
 CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
                                   LayerNormAttrs const &,
-                                  InputParallelTensorDesc const &input_shape,
+                                  InputParallelTensorDesc const &input,
                                   ProfilingSettings const &settings,
                                   MachineView const &machine_view);
 

From 0cd1ca38b8271deb5a8afe3e2a2f58081fa8eb3c Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar <forvirenra@gmail.com>
Date: Wed, 7 Feb 2024 12:59:57 -0800
Subject: [PATCH 8/9] Apply suggestions from code review

---
 lib/runtime/src/ops/layer_norm.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc
index 26dc049d4d..a380e4dc66 100644
--- a/lib/runtime/src/ops/layer_norm.cc
+++ b/lib/runtime/src/ops/layer_norm.cc
@@ -214,13 +214,13 @@ OpTaskSignature fwd_signature<LAYERNORM_FWD_TASK_ID>() {
 
 
 template <>
-OpTaskSignature bwd_signature<AYERNORM_BWD_TASK_ID>()  {
+OpTaskSignature bwd_signature<LAYERNORM_BWD_TASK_ID>()  {
   OpTaskSignature bwd = infer_bwd_signature(fwd_signature<LAYERNORM_FWD_TASK_ID>());
   return bwd;
 }
 
 template <>
-OpTaskSignature init_signatur<LAYERNORM_INIT_TASK_ID>()  {
+OpTaskSignature init_signature<LAYERNORM_INIT_TASK_ID>()  {
   OpTaskSignature init(OpTaskType::INIT);
   init.add_input_slot(INPUT);
   init.add_arg_slot<LayerNormAttrs>(ATTRS);
@@ -233,7 +233,7 @@ OpTaskSignature init_signatur<LAYERNORM_INIT_TASK_ID>()  {
 template <>
 void register_task<LAYERNORM_INIT_TASK_ID>() {
 
-  register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signatur<LAYERNORM_INIT_TASK_ID>(), init_task);
+  register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signature<LAYERNORM_INIT_TASK_ID>(), init_task);
 }
 
 template <>
@@ -244,7 +244,7 @@ void register_task<LAYERNORM_FWD_TASK_ID>() {
 template <>
 void register_task<LAYERNORM_BWD_TASK_ID>() {
   register_task(
-      LAYERNORM_BWD_TASK_ID, "LayerNorm backward",  bwd_signatur<AYERNORM_BWD_TASK_ID>() , backward_task);
+      LAYERNORM_BWD_TASK_ID, "LayerNorm backward",  bwd_signature<LAYERNORM_BWD_TASK_ID>() , backward_task);
 }
 
 }

From 8adb275e90e34423a8751d80638f1a23567ef9be Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 7 Feb 2024 18:21:04 -0500
Subject: [PATCH 9/9] fix the typo and format

---
 lib/runtime/src/ops/layer_norm.cc | 66 +++++++++++++++++++------------
 1 file changed, 41 insertions(+), 25 deletions(-)

diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc
index 26dc049d4d..6bc671c249 100644
--- a/lib/runtime/src/ops/layer_norm.cc
+++ b/lib/runtime/src/ops/layer_norm.cc
@@ -29,7 +29,16 @@ using Legion::Task;
 
 namespace FlexFlow {
 
-enum Slots { PROFILING, INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE };
+enum Slots {
+  PROFILING,
+  INPUT,
+  OUTPUT,
+  GAMMA,
+  BETA,
+  PER_DEVICE_STATE,
+  ATTRS,
+  HANDLE
+};
 
 OpTaskInvocation init(LayerNormAttrs const &attrs) {
   OpTaskBinding b;
@@ -136,19 +145,19 @@ static DeviceSpecific<LayerNormPerDeviceState>
   int num_replicas = 1;
   for (int i = 0; i < intput.shape.num_dims(); i++) {
     num_replicas *= input.shape.at(legion_dim_t(i));
-  effective_num_elements = M;
-  effective_batch_size = input.shape.get_volume() / M;
-
-  DeviceSpecific<LayerNormPerDeviceState> per_device_state =
-      acc.create_device_specific<LayerNormPerDeviceState>(
-          init_kernel(handle,
-                      allocator,
-                      attrs.elementwise_affine,
-                      effective_batch_size,
-                      effective_num_elements,
-                      attrs.eps));
+    effective_num_elements = M;
+    effective_batch_size = input.shape.get_volume() / M;
+
+    DeviceSpecific<LayerNormPerDeviceState> per_device_state =
+        acc.create_device_specific<LayerNormPerDeviceState>(
+            init_kernel(handle,
+                        allocator,
+                        attrs.elementwise_affine,
+                        effective_batch_size,
+                        effective_num_elements,
+                        attrs.eps));
+  }
 }
-    }
 
 static DeviceSpecific<LayerNormPerDeviceState>
     init_task(Task const *task,
@@ -184,8 +193,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
   fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state);
 
   // TODO how to handle gamma and beta, where are they from
-fwd_binding.bind(GAMMA, input_shape);
-fwd_binding.bind(BETA, input_shape);
+  fwd_binding.bind(GAMMA, input_shape);
+  fwd_binding.bind(BETA, input_shape);
   SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
 
   auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding);
@@ -212,15 +221,15 @@ OpTaskSignature fwd_signature<LAYERNORM_FWD_TASK_ID>() {
   return fwd;
 }
 
-
 template <>
-OpTaskSignature bwd_signature<AYERNORM_BWD_TASK_ID>()  {
-  OpTaskSignature bwd = infer_bwd_signature(fwd_signature<LAYERNORM_FWD_TASK_ID>());
+OpTaskSignature bwd_signature<LAYERNORM_BWD_TASK_ID>() {
+  OpTaskSignature bwd =
+      infer_bwd_signature(fwd_signature<LAYERNORM_FWD_TASK_ID>());
   return bwd;
 }
 
 template <>
-OpTaskSignature init_signatur<LAYERNORM_INIT_TASK_ID>()  {
+OpTaskSignature init_signature<LAYERNORM_INIT_TASK_ID>() {
   OpTaskSignature init(OpTaskType::INIT);
   init.add_input_slot(INPUT);
   init.add_arg_slot<LayerNormAttrs>(ATTRS);
@@ -233,19 +242,26 @@ OpTaskSignature init_signatur<LAYERNORM_INIT_TASK_ID>()  {
 template <>
 void register_task<LAYERNORM_INIT_TASK_ID>() {
 
-  register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signatur<LAYERNORM_INIT_TASK_ID>(), init_task);
+  register_task(LAYERNORM_INIT_TASK_ID,
+                "LayerNorm init",
+                init_signature<LAYERNORM_INIT_TASK_ID>(),
+                init_task);
 }
 
 template <>
 void register_task<LAYERNORM_FWD_TASK_ID>() {
-  register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd_signature<LAYERNORM_FWD_TASK_ID>() , forward_task);
+  register_task(LAYERNORM_FWD_TASK_ID,
+                "LayerNorm forward",
+                fwd_signature<LAYERNORM_FWD_TASK_ID>(),
+                forward_task);
 }
 
 template <>
 void register_task<LAYERNORM_BWD_TASK_ID>() {
-  register_task(
-      LAYERNORM_BWD_TASK_ID, "LayerNorm backward",  bwd_signatur<AYERNORM_BWD_TASK_ID>() , backward_task);
-}
-
+  register_task(LAYERNORM_BWD_TASK_ID,
+                "LayerNorm backward",
+                bwd_signature<LAYERNORM_BWD_TASK_ID>(),
+                backward_task);
 }
 
+} // namespace FlexFlow