From c5e320283a39d30e4cdf4737799eef3aa34fbcf7 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 8 Sep 2023 05:51:23 +0000
Subject: [PATCH 01/13] add init_kernel for reduction

---
 deps/fmt                                     |  2 +-
 lib/kernels/include/kernels/reduce_kernels.h | 29 +++++++++++---------
 lib/kernels/src/cuda/reduce_kernels.cu       |  9 ++++++
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/deps/fmt b/deps/fmt
index f5e54359df..a33701196a 160000
--- a/deps/fmt
+++ b/deps/fmt
@@ -1 +1 @@
-Subproject commit f5e54359df4c26b6230fc61d38aa294581393084
+Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50
diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h
index aeea932f04..99251ee6dd 100644
--- a/lib/kernels/include/kernels/reduce_kernels.h
+++ b/lib/kernels/include/kernels/reduce_kernels.h
@@ -5,25 +5,28 @@
 
 namespace FlexFlow {
 
-class ReducePerDeviceState : public PerDeviceOpState {
-public:
-  ReducePerDeviceState(FFHandler handler,
-                       Reduce const *rd,
-                       Legion::Domain const &input_domain);
-  ~ReducePerDeviceState(void);
-#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-  cudnnTensorDescriptor_t inputTensor, outputTensor;
-  cudnnReduceTensorDescriptor_t reduceDesc;
-#else
-  miopenTensorDescriptor_t inputTensor, outputTensor;
-  miopenReduceTensorDescriptor_t reduceDesc;
-#endif
+struct ReducePerDeviceState {
+  PerDeviceFFHandle handle;
+  ffTensorDescriptor_t inputTensor;
+  ffTensorDescriptor_t outputTensor;
+  ffReduceTensorDescriptor_t reduceDesc;
   OperatorType op_type;
   size_t reduction_size;
 };
 
+FF_VISITABLE_STRUCT_NO_EQ(ReducePerDeviceState, handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size);
+
+
 namespace Kernels {
 namespace Reduce {
+
+ReducePerDeviceState init_kernel(PerDeviceFFhandle const &,
+                                ffTensorDescriptor_t const &,
+                                ffTensorDescriptor_t const &,
+                                ffReduceTensorDescriptor_t const &,
+                                OperatorType const &,
+                                size_t const &);
+
 void forward_kernel_wrapper(ReducePerDeviceState const *m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output);
diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu
index d675f88073..245b0d06e5 100644
--- a/lib/kernels/src/cuda/reduce_kernels.cu
+++ b/lib/kernels/src/cuda/reduce_kernels.cu
@@ -67,6 +67,15 @@ ReducePerDeviceState::~ReducePerDeviceState(void) {
 namespace Kernels {
 namespace Reduce {
 
+ReducePerDeviceState init_kernel(PerDeviceFFhandle const & handle,
+    ffTensorDescriptor_t const & input_tensor,
+    ffTensorDescriptor_t const & outputTensor,
+    ffReduceTensorDescriptor_t const & reduceDesc,
+    OperatorType const & op_type,
+    size_t const &  reduction_size) {
+      return {handle, input_tensor, outputTensor, reduceDesc, op_type, reduction_size};
+}
+
 void forward_kernel(cudaStream_t stream,
                     ReducePerDeviceState const *m,
                     float const *input_ptr,

From 2ce00f01e773c05f00417c8191167c79748fa642 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 8 Sep 2023 06:12:24 +0000
Subject: [PATCH 02/13] implement the init

---
 lib/runtime/src/ops/reduce.cc | 801 ++++++++++++++++++----------------
 lib/runtime/src/ops/reduce.h  |   4 +-
 2 files changed, 434 insertions(+), 371 deletions(-)

diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index 8879cad592..5270006bc7 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -1,7 +1,9 @@
 #include "reduce.h"
 #include "kernels/reduce_kernels.h"
 #include "legion/legion_utilities.h"
+#include "utils/exception.decl.h"
 #include "utils/hash-utils.h"
+#include "op-attrs/get_output_shape.h"
 
 namespace FlexFlow {
 // declare Legion names
@@ -45,391 +47,452 @@ ReduceParams Reduce::get_params() const {
   return params;
 }
 
-Tensor FFModel::reduce_sum(OperatorType op,
-                           const Tensor input,
-                           std::vector<int> const &_axes,
-                           bool keepdims,
-                           char const *name) {
-  Layer *rd = new Layer(this,
-                        op,
-                        DT_FLOAT,
-                        name,
-                        1 /*input*/,
-                        0 /*weights*/,
-                        1 /*outputs*/,
-                        input);
-  // Use Legion indexing to store axes
-  std::vector<int> axes;
-  for (size_t i = 0; i < _axes.size(); i++) {
-    axes.push_back(input->num_dims - 1 - _axes[i]);
-  }
-  int dims[MAX_TENSOR_DIM];
-  int numdim = input->num_dims;
-  if (keepdims) {
-    for (int i = 0; i < input->num_dims; i++) {
-      dims[i] = input->dims[i];
-    }
-    for (size_t i = 0; i < axes.size(); i++) {
-      dims[axes[i]] = 1;
-    }
-  } else {
-    numdim = 0;
-    for (int i = 0; i < input->num_dims; i++) {
-      bool reduced = false;
-      for (size_t j = 0; j < axes.size(); j++) {
-        if (axes[j] == i) {
-          reduced = true;
-        }
-      }
-      if (!reduced) {
-        dims[numdim++] = input->dims[i];
-      }
-    }
-    assert(numdim + axes.size() == input->num_dims);
-  }
-  rd->outputs[0] = create_tensor_legion_ordering(
-      numdim, dims, input->data_type, rd, 0, true /*create_grad*/);
-  rd->add_int_vector_property("legion_axes", axes);
-  rd->add_int_property("keepdims", keepdims);
-  layers.push_back(rd);
-  return rd->outputs[0];
-}
+enum Slots {INPUT, OUTPUT, ATTRS, PROFILING, REDUCE, PER_DEVICE_STATE, HANDLE};
 
-Tensor FFModel::reduce_sum(const Tensor input,
-                           std::vector<int> const &_axes,
-                           bool keepdims,
-                           char const *name) {
-  return this->reduce(OP_REDUCE_SUM, input, _axes, keepdims, name);
-}
+OpTaskInvocation init(TransposeAttrs const &attrs) {
+  OpTaskBinding binding;
 
-Tensor FFModel::reduce_mean(const Tensor input,
-                            std::vector<int> const &_axes,
-                            bool keepdims,
-                            char const *name) {
-  return this->reduce(OP_REDUCE_MEAN, input, _axes, keepdims, name);
-}
+  binding.bind_arg(HANDLE, ff_handle());.
+  binding.bind_arg(ATTRS, attrs);
 
-Op *Reduce::create_operator_from_layer(
-    FFModel &model,
-    Layer const *layer,
-    std::vector<ParallelTensor> const &inputs) {
-  std::vector<int> axes;
-  long long value;
-  layer->get_int_vector_property("legion_axes", axes);
-  layer->get_int_property("keepdims", value);
-  bool keepdims = value;
-  return new Reduce(
-      model, layer->op_type, inputs[0], axes, keepdims, layer->name);
+  bind.bind(REDUCE, reduce_descriptor());//Note: how to bind ffReduceTensorDescriptor_t ?
+  binding.bind(INPUT, input_tensor(0));
+  binding.bind(OUTPUT, output_tensor(0));
 }
 
-Reduce::Reduce(FFModel &model,
-               ReduceParams const &params,
-               const ParallelTensor input,
-               char const *name)
-    : Reduce(model, params.op_type, input, params.axes, params.keepdims, name) {
+static DeviceSpecific<ReducePerDeviceState> init_task_impl(TaskArgumentAccessor const &acc) {
+  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  auto input = acc.get_tensor<Permissions::RO>(INPUT);
+  auto output = acc.get_tensor<Permissions::WO>(OUTPUT);
+  auto attrs = acc.get_argument<ReduceAttrs>(ATTRS);
+  auto reduce_desc = acc.get_argument<ffReduceTensorDescriptor_t>(REDUCE); //Note: this may has some problem, I should ask Colin 
+  OperatorType = attrs.op_type;
+  //Note: How to set the reduction size?
+  size_t reduction_size
+  DeviceSpecific<ReducePerDeviceState> per_device_state = acc.create_device_specific<ReducePerDeviceState>(init_kernel(handle, input, output, reduce_desc, op_type, reduction_size));
+  return per_device_state;
 }
 
-Reduce::Reduce(FFModel &model,
-               OperatorType _op_type,
-               const ParallelTensor input,
-               std::vector<int> const &_axes,
-               bool _keepdims,
-               char const *name)
-    : Op(model,
-         _op_type,
-         input->data_type,
-         name,
-         1 /*inputs*/,
-         0 /*weights*/,
-         1 /*outputs*/,
-         input),
-      num_axes(_axes.size()), keepdims(_keepdims) {
-  for (size_t i = 0; i < num_axes; i++) {
-    axes[i] = _axes[i];
-  }
-  int num_dims = input->num_dims;
-  ParallelDim dims[MAX_TENSOR_DIM];
-  if (keepdims) {
-    num_dims = input->num_dims;
-    for (int i = 0; i < num_dims; i++) {
-      dims[i] = input->dims[i];
-    }
-    for (int i = 0; i < num_axes; i++) {
-      // Currently assume that we cannot parallelize along reduced dims
-      assert(dims[axes[i]].degree == 1);
-      dims[axes[i]].size = 1;
-    }
-  } else {
-    num_dims = 0;
-    for (int i = 0; i < input->num_dims; i++) {
-      bool reduced = false;
-      for (int j = 0; j < num_axes; j++) {
-        if (axes[j] == i) {
-          reduced = true;
-        }
-      }
-      if (!reduced) {
-        dims[num_dims++] = input->dims[i];
-      } else {
-        // Currently assume that we cannot parallelize along reduced dims
-        assert(input->dims[i].degree == 1);
-        assert(input->dims[i].parallel_idx == -1);
-      }
-    }
-  }
-  outputs[0] = model.create_parallel_tensor_legion_ordering(
-      num_dims, dims, input->data_type, this);
+static DeviceSpecific<TransposePerDeviceState>
+    init_task(Task const *task,
+              std::vector<PhysicalRegion> const &regions,
+              Context ctx,
+              Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  return init_task_impl(acc);
 }
 
-void Reduce::init(FFModel const &ff) {
-  assert(check_output_input_weight_same_parallel_is());
-  parallel_is = outputs[0]->parallel_is;
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_init(ff, argmap);
-  IndexLauncher launcher(REDUCE_INIT_TASK_ID,
-                         parallel_is,
-                         TaskArgument(this, sizeof(Reduce)),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  FutureMap fm = runtime->execute_index_space(ctx, launcher);
-  fm.wait_all_results();
-  set_opmeta_from_futuremap(ff, fm);
-};
-
-PerDeviceOpState *Reduce::init_task(Task const *task,
-                                    std::vector<PhysicalRegion> const &regions,
-                                    Context ctx,
-                                    Runtime *runtime) {
-  Reduce *rd = (Reduce *)task->args;
-  FFHandler handle = *((FFHandler *)task->local_args);
-  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
-      DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  ReduceMeta *m = new ReduceMeta(handle, rd, input.domain);
-  return m;
-}
+plate <>
+void register_task<TRANSPOSE_INIT_TASK_ID>() {
+    OpTaskSignature init(OpTaskType::INIT)
 
-void Reduce::forward(FFModel const &ff) {
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_forward(ff, argmap);
-  IndexLauncher launcher(REDUCE_FWD_TASK_ID,
-                         parallel_is,
-                         TaskArgument(nullptr, false),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  runtime->execute_index_space(ctx, launcher);
-}
+    init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+    init.add_arg_slot<ReducePerDeviceState>(ATTRS);
 
-void Reduce::forward_task(Task const *task,
-                          std::vector<PhysicalRegion> const &regions,
-                          Context ctx,
-                          Runtime *runtime) {
-  assert(regions.size() == 2);
-  assert(task->regions.size() == 2);
-  ReduceMeta const *m = *((ReduceMeta **)task->local_args);
-  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
-      DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-
-  forward_kernel_wrapper(m, input, output);
-}
+    init.add_input_slot(INPUT);
+    init.add_output_slot(OUTPUT);
+    //Note:how to add reduce? 
 
-void Reduce::backward(FFModel const &ff) {
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_backward(ff, argmap);
-  IndexLauncher launcher(REDUCE_BWD_TASK_ID,
-                         parallel_is,
-                         TaskArgument(nullptr, 0),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  // regions[0](I): output_grad
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region_grad));
-  launcher.add_field(0, FID_DATA);
-  // regions[1](I/O): input_grad
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    READ_WRITE,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region_grad));
-  launcher.add_field(1, FID_DATA);
-  runtime->execute_index_space(ctx, launcher);
-}
+    register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task);
+} 
 
-void Reduce::backward_task(Task const *task,
-                           std::vector<PhysicalRegion> const &regions,
-                           Context ctx,
-                           Runtime *runtime) {
-  assert(regions.size() == 2);
-  assert(task->regions.size() == 2);
-  ReduceMeta const *m = *((ReduceMeta **)task->local_args);
-  GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO(
-      DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
-      DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  backward_kernel_wrapper(m, output_grad, input_grad);
-}
-
-bool Reduce::measure_operator_cost(Simulator *sim,
-                                   MachineView const &mv,
-                                   CostMetrics &cost_metrics) const {
-  ParallelTensorBase sub_input, sub_output;
-  if (!outputs[0]->get_sub_tensor(mv, sub_output)) {
-    return false;
-  }
-  if (!inputs[0]->get_sub_tensor(mv, sub_input)) {
-    return false;
-  }
-  ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain());
-  sim->free_all();
-  float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-  assert(input_ptr != NULL);
-  cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-  GenericTensorAccessorR input_acc(
-      inputs[0]->data_type, sub_input.get_domain(), input_ptr);
-
-  float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-  assert(output_ptr != NULL);
-  cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-  GenericTensorAccessorW output_acc(
-      outputs[0]->data_type, sub_output.get_domain(), output_ptr);
-
-  assert(m->profiling == false);
-
-  std::function<void()> forward, backward;
-  forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); };
-  if (sim->computationMode == COMP_MODE_TRAINING) {
-    float *input_grad_ptr =
-        (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-    assert(input_grad_ptr != NULL);
-    cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-    GenericTensorAccessorW input_grad_acc(
-        inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr);
-
-    float *output_grad_ptr =
-        (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-    assert(output_grad_ptr != NULL);
-    cost_metrics.outputs_memory +=
-        cost_metrics.total_mem_diff_from(sim->offset);
-    GenericTensorAccessorR output_grad_acc(
-        outputs[0]->data_type, sub_output.get_domain(), output_grad_ptr);
-
-    backward = [&] {
-      backward_kernel_wrapper(m, output_grad_acc, input_grad_acc);
-    };
-  }
-
-  inner_measure_operator_cost(sim, forward, backward, cost_metrics);
-
-  if (sim->computationMode == COMP_MODE_TRAINING) {
-    printf("[Measure Reduce] name(%s) forward_time(%.4lf) "
-           "backward_time(%.4lf)\n",
-           name,
-           cost_metrics.forward_time,
-           cost_metrics.backward_time);
-  } else {
-    printf("[Measure Reduce] name(%s) forward_time(%.4lf)\n",
-           name,
-           cost_metrics.forward_time);
-  }
-
-  return true;
-}
-
-void Reduce::serialize(Legion::Serializer &sez) const {
-  ReduceParams params = get_params();
-  sez.serialize(params.op_type);
-  sez.serialize(params.axes.size());
-  for (size_t i = 0; i < params.axes.size(); i++) {
-    sez.serialize(params.axes[i]);
-  }
-  sez.serialize(params.keepdims);
-}
+OpTaskInvocation forward(ReduceAttrs const & attrs) {
+  OpTaskBinding binding;
 
-using PCG::Node;
-Node Reduce::deserialize(FFModel &ff,
-                         Legion::Deserializer &dez,
-                         ParallelTensor inputs[],
-                         int num_inputs) {
-  assert(num_inputs == 1);
-  OperatorType op_type;
-  size_t axes_size;
-  bool keepdims;
-  std::vector<int> axes;
-  dez.deserialize(op_type);
-  dez.deserialize(axes_size);
-  for (size_t i = 0; i < axes_size; i++) {
-    int dim_idx;
-    dez.deserialize(dim_idx);
-    axes.push_back(dim_idx);
-  }
-  dez.deserialize(keepdims);
-  return ff.get_or_create_node<Reduce>(inputs[0], {axes, op_type, keepdims});
-}
+  bind.bind_arg(PER_DEVICE_STATE, per_device_op_state<ReducePerDeviceState>());
+  bind.bind_arg(PROFILING, profiling_tensor());
 
-Op *Reduce::materialize(FFModel &ff,
-                        ParallelTensor inputs[],
-                        int num_inputs) const {
-  ReduceParams params = get_params();
-  return new Reduce(ff, params, inputs[0], this->name);
+  binding.bind(REDUCE, reduce_descriptor());//Note:this may have some problem, how to bind the ffReduceTensorDescriptor_t 
+  binding.bind(INPUT, input_tensor(0));
+  binding.bind(OUTPUT, output_tensor(0));
 }
 
-}; // namespace FlexFlow
 
-namespace std {
-size_t hash<FlexFlow::ReduceParams>::operator()(
-    FlexFlow::ReduceParams const &params) const {
-  size_t key = 0;
-  hash_combine(key, params.op_type);
-  hash_combine(key, params.axes.size());
-  for (int n : params.axes) {
-    hash_combine(key, n);
-  }
-  hash_combine(key, params.keepdims);
-  return key;
-}
+// Tensor FFModel::reduce_sum(OperatorType op,
+//                            const Tensor input,
+//                            std::vector<int> const &_axes,
+//                            bool keepdims,
+//                            char const *name) {
+//   Layer *rd = new Layer(this,
+//                         op,
+//                         DT_FLOAT,
+//                         name,
+//                         1 /*input*/,
+//                         0 /*weights*/,
+//                         1 /*outputs*/,
+//                         input);
+//   // Use Legion indexing to store axes
+//   std::vector<int> axes;
+//   for (size_t i = 0; i < _axes.size(); i++) {
+//     axes.push_back(input->num_dims - 1 - _axes[i]);
+//   }
+//   int dims[MAX_TENSOR_DIM];
+//   int numdim = input->num_dims;
+//   if (keepdims) {
+//     for (int i = 0; i < input->num_dims; i++) {
+//       dims[i] = input->dims[i];
+//     }
+//     for (size_t i = 0; i < axes.size(); i++) {
+//       dims[axes[i]] = 1;
+//     }
+//   } else {
+//     numdim = 0;
+//     for (int i = 0; i < input->num_dims; i++) {
+//       bool reduced = false;
+//       for (size_t j = 0; j < axes.size(); j++) {
+//         if (axes[j] == i) {
+//           reduced = true;
+//         }
+//       }
+//       if (!reduced) {
+//         dims[numdim++] = input->dims[i];
+//       }
+//     }
+//     assert(numdim + axes.size() == input->num_dims);
+//   }
+//   rd->outputs[0] = create_tensor_legion_ordering(
+//       numdim, dims, input->data_type, rd, 0, true /*create_grad*/);
+//   rd->add_int_vector_property("legion_axes", axes);
+//   rd->add_int_property("keepdims", keepdims);
+//   layers.push_back(rd);
+//   return rd->outputs[0];
+// }
+
+// Tensor FFModel::reduce_sum(const Tensor input,
+//                            std::vector<int> const &_axes,
+//                            bool keepdims,
+//                            char const *name) {
+//   return this->reduce(OP_REDUCE_SUM, input, _axes, keepdims, name);
+// }
+
+// Tensor FFModel::reduce_mean(const Tensor input,
+//                             std::vector<int> const &_axes,
+//                             bool keepdims,
+//                             char const *name) {
+//   return this->reduce(OP_REDUCE_MEAN, input, _axes, keepdims, name);
+// }
+
+// Op *Reduce::create_operator_from_layer(
+//     FFModel &model,
+//     Layer const *layer,
+//     std::vector<ParallelTensor> const &inputs) {
+//   std::vector<int> axes;
+//   long long value;
+//   layer->get_int_vector_property("legion_axes", axes);
+//   layer->get_int_property("keepdims", value);
+//   bool keepdims = value;
+//   return new Reduce(
+//       model, layer->op_type, inputs[0], axes, keepdims, layer->name);
+// }
+
+// Reduce::Reduce(FFModel &model,
+//                ReduceParams const &params,
+//                const ParallelTensor input,
+//                char const *name)
+//     : Reduce(model, params.op_type, input, params.axes, params.keepdims, name) {
+// }
+
+// Reduce::Reduce(FFModel &model,
+//                OperatorType _op_type,
+//                const ParallelTensor input,
+//                std::vector<int> const &_axes,
+//                bool _keepdims,
+//                char const *name)
+//     : Op(model,
+//          _op_type,
+//          input->data_type,
+//          name,
+//          1 /*inputs*/,
+//          0 /*weights*/,
+//          1 /*outputs*/,
+//          input),
+//       num_axes(_axes.size()), keepdims(_keepdims) {
+//   for (size_t i = 0; i < num_axes; i++) {
+//     axes[i] = _axes[i];
+//   }
+//   int num_dims = input->num_dims;
+//   ParallelDim dims[MAX_TENSOR_DIM];
+//   if (keepdims) {
+//     num_dims = input->num_dims;
+//     for (int i = 0; i < num_dims; i++) {
+//       dims[i] = input->dims[i];
+//     }
+//     for (int i = 0; i < num_axes; i++) {
+//       // Currently assume that we cannot parallelize along reduced dims
+//       assert(dims[axes[i]].degree == 1);
+//       dims[axes[i]].size = 1;
+//     }
+//   } else {
+//     num_dims = 0;
+//     for (int i = 0; i < input->num_dims; i++) {
+//       bool reduced = false;
+//       for (int j = 0; j < num_axes; j++) {
+//         if (axes[j] == i) {
+//           reduced = true;
+//         }
+//       }
+//       if (!reduced) {
+//         dims[num_dims++] = input->dims[i];
+//       } else {
+//         // Currently assume that we cannot parallelize along reduced dims
+//         assert(input->dims[i].degree == 1);
+//         assert(input->dims[i].parallel_idx == -1);
+//       }
+//     }
+//   }
+//   outputs[0] = model.create_parallel_tensor_legion_ordering(
+//       num_dims, dims, input->data_type, this);
+// }
+
+// void Reduce::init(FFModel const &ff) {
+//   assert(check_output_input_weight_same_parallel_is());
+//   parallel_is = outputs[0]->parallel_is;
+//   ArgumentMap argmap;
+//   Context ctx = ff.config.lg_ctx;
+//   Runtime *runtime = ff.config.lg_hlr;
+//   set_argumentmap_for_init(ff, argmap);
+//   IndexLauncher launcher(REDUCE_INIT_TASK_ID,
+//                          parallel_is,
+//                          TaskArgument(this, sizeof(Reduce)),
+//                          argmap,
+//                          Predicate::TRUE_PRED,
+//                          false /*must*/,
+//                          0 /*mapper_id*/,
+//                          outputs[0]->machine_view.hash());
+//   launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
+//                                                     0 /*projection id*/,
+//                                                     READ_ONLY,
+//                                                     EXCLUSIVE,
+//                                                     inputs[0]->region));
+//   launcher.add_field(0, FID_DATA);
+//   launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
+//                                                     0 /*projection id*/,
+//                                                     WRITE_ONLY,
+//                                                     EXCLUSIVE,
+//                                                     outputs[0]->region));
+//   launcher.add_field(1, FID_DATA);
+//   FutureMap fm = runtime->execute_index_space(ctx, launcher);
+//   fm.wait_all_results();
+//   set_opmeta_from_futuremap(ff, fm);
+// };
+
+// PerDeviceOpState *Reduce::init_task(Task const *task,
+//                                     std::vector<PhysicalRegion> const &regions,
+//                                     Context ctx,
+//                                     Runtime *runtime) {
+//   Reduce *rd = (Reduce *)task->args;
+//   FFHandler handle = *((FFHandler *)task->local_args);
+//   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
+//       DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
+//   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
+//       DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+//   ReduceMeta *m = new ReduceMeta(handle, rd, input.domain);
+//   return m;
+// }
+
+// void Reduce::forward(FFModel const &ff) {
+//   ArgumentMap argmap;
+//   Context ctx = ff.config.lg_ctx;
+//   Runtime *runtime = ff.config.lg_hlr;
+//   set_argumentmap_for_forward(ff, argmap);
+//   IndexLauncher launcher(REDUCE_FWD_TASK_ID,
+//                          parallel_is,
+//                          TaskArgument(nullptr, false),
+//                          argmap,
+//                          Predicate::TRUE_PRED,
+//                          false /*must*/,
+//                          0 /*mapper_id*/,
+//                          outputs[0]->machine_view.hash());
+//   launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
+//                                                     0 /*projection id*/,
+//                                                     READ_ONLY,
+//                                                     EXCLUSIVE,
+//                                                     inputs[0]->region));
+//   launcher.add_field(0, FID_DATA);
+//   launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
+//                                                     0 /*projection id*/,
+//                                                     WRITE_ONLY,
+//                                                     EXCLUSIVE,
+//                                                     outputs[0]->region));
+//   launcher.add_field(1, FID_DATA);
+//   runtime->execute_index_space(ctx, launcher);
+// }
+
+// void Reduce::forward_task(Task const *task,
+//                           std::vector<PhysicalRegion> const &regions,
+//                           Context ctx,
+//                           Runtime *runtime) {
+//   assert(regions.size() == 2);
+//   assert(task->regions.size() == 2);
+//   ReduceMeta const *m = *((ReduceMeta **)task->local_args);
+//   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
+//       DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
+//   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
+//       DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+
+//   forward_kernel_wrapper(m, input, output);
+// }
+
+// void Reduce::backward(FFModel const &ff) {
+//   ArgumentMap argmap;
+//   Context ctx = ff.config.lg_ctx;
+//   Runtime *runtime = ff.config.lg_hlr;
+//   set_argumentmap_for_backward(ff, argmap);
+//   IndexLauncher launcher(REDUCE_BWD_TASK_ID,
+//                          parallel_is,
+//                          TaskArgument(nullptr, 0),
+//                          argmap,
+//                          Predicate::TRUE_PRED,
+//                          false /*must*/,
+//                          0 /*mapper_id*/,
+//                          outputs[0]->machine_view.hash());
+//   // regions[0](I): output_grad
+//   launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad,
+//                                                     0 /*projection id*/,
+//                                                     READ_ONLY,
+//                                                     EXCLUSIVE,
+//                                                     outputs[0]->region_grad));
+//   launcher.add_field(0, FID_DATA);
+//   // regions[1](I/O): input_grad
+//   launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad,
+//                                                     0 /*projection id*/,
+//                                                     READ_WRITE,
+//                                                     EXCLUSIVE,
+//                                                     inputs[0]->region_grad));
+//   launcher.add_field(1, FID_DATA);
+//   runtime->execute_index_space(ctx, launcher);
+// }
+
+// void Reduce::backward_task(Task const *task,
+//                            std::vector<PhysicalRegion> const &regions,
+//                            Context ctx,
+//                            Runtime *runtime) {
+//   assert(regions.size() == 2);
+//   assert(task->regions.size() == 2);
+//   ReduceMeta const *m = *((ReduceMeta **)task->local_args);
+//   GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO(
+//       DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
+//   GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
+//       DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+//   backward_kernel_wrapper(m, output_grad, input_grad);
+// }
+
+// bool Reduce::measure_operator_cost(Simulator *sim,
+//                                    MachineView const &mv,
+//                                    CostMetrics &cost_metrics) const {
+//   ParallelTensorBase sub_input, sub_output;
+//   if (!outputs[0]->get_sub_tensor(mv, sub_output)) {
+//     return false;
+//   }
+//   if (!inputs[0]->get_sub_tensor(mv, sub_input)) {
+//     return false;
+//   }
+//   ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain());
+//   sim->free_all();
+//   float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
+//   assert(input_ptr != NULL);
+//   cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
+//   GenericTensorAccessorR input_acc(
+//       inputs[0]->data_type, sub_input.get_domain(), input_ptr);
+
+//   float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
+//   assert(output_ptr != NULL);
+//   cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
+//   GenericTensorAccessorW output_acc(
+//       outputs[0]->data_type, sub_output.get_domain(), output_ptr);
+
+//   assert(m->profiling == false);
+
+//   std::function<void()> forward, backward;
+//   forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); };
+//   if (sim->computationMode == COMP_MODE_TRAINING) {
+//     float *input_grad_ptr =
+//         (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
+//     assert(input_grad_ptr != NULL);
+//     cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
+//     GenericTensorAccessorW input_grad_acc(
+//         inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr);
+
+//     float *output_grad_ptr =
+//         (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
+//     assert(output_grad_ptr != NULL);
+//     cost_metrics.outputs_memory +=
+//         cost_metrics.total_mem_diff_from(sim->offset);
+//     GenericTensorAccessorR output_grad_acc(
+//         outputs[0]->data_type, sub_output.get_domain(), output_grad_ptr);
+
+//     backward = [&] {
+//       backward_kernel_wrapper(m, output_grad_acc, input_grad_acc);
+//     };
+//   }
+
+//   inner_measure_operator_cost(sim, forward, backward, cost_metrics);
+
+//   if (sim->computationMode == COMP_MODE_TRAINING) {
+//     printf("[Measure Reduce] name(%s) forward_time(%.4lf) "
+//            "backward_time(%.4lf)\n",
+//            name,
+//            cost_metrics.forward_time,
+//            cost_metrics.backward_time);
+//   } else {
+//     printf("[Measure Reduce] name(%s) forward_time(%.4lf)\n",
+//            name,
+//            cost_metrics.forward_time);
+//   }
+
+//   return true;
+// }
+
+// void Reduce::serialize(Legion::Serializer &sez) const {
+//   ReduceParams params = get_params();
+//   sez.serialize(params.op_type);
+//   sez.serialize(params.axes.size());
+//   for (size_t i = 0; i < params.axes.size(); i++) {
+//     sez.serialize(params.axes[i]);
+//   }
+//   sez.serialize(params.keepdims);
+// }
+
+// using PCG::Node;
+// Node Reduce::deserialize(FFModel &ff,
+//                          Legion::Deserializer &dez,
+//                          ParallelTensor inputs[],
+//                          int num_inputs) {
+//   assert(num_inputs == 1);
+//   OperatorType op_type;
+//   size_t axes_size;
+//   bool keepdims;
+//   std::vector<int> axes;
+//   dez.deserialize(op_type);
+//   dez.deserialize(axes_size);
+//   for (size_t i = 0; i < axes_size; i++) {
+//     int dim_idx;
+//     dez.deserialize(dim_idx);
+//     axes.push_back(dim_idx);
+//   }
+//   dez.deserialize(keepdims);
+//   return ff.get_or_create_node<Reduce>(inputs[0], {axes, op_type, keepdims});
+// }
+
+// Op *Reduce::materialize(FFModel &ff,
+//                         ParallelTensor inputs[],
+//                         int num_inputs) const {
+//   ReduceParams params = get_params();
+//   return new Reduce(ff, params, inputs[0], this->name);
+// }
+
+// }; // namespace FlexFlow
+
+// namespace std {
+// size_t hash<FlexFlow::ReduceParams>::operator()(
+//     FlexFlow::ReduceParams const &params) const {
+//   size_t key = 0;
+//   hash_combine(key, params.op_type);
+//   hash_combine(key, params.axes.size());
+//   for (int n : params.axes) {
+//     hash_combine(key, n);
+//   }
+//   hash_combine(key, params.keepdims);
+//   return key;
+// }
 }; // namespace std
diff --git a/lib/runtime/src/ops/reduce.h b/lib/runtime/src/ops/reduce.h
index 52b4eff609..fd3df85281 100644
--- a/lib/runtime/src/ops/reduce.h
+++ b/lib/runtime/src/ops/reduce.h
@@ -2,7 +2,7 @@
 #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H
 
 #include "op-attrs/ops/reduce.h"
-#include "op_task_invocation.h"
+#include "task_spec/op_task_invocation.h"
 #include "sim_environment.h"
 
 namespace FlexFlow {
@@ -20,7 +20,7 @@ OpTaskInvocation backward(ReduceAttrs const &);
 
 CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
                                   ReduceAttrs const &attrs,
-                                  ParallelTensorShape const &input_shape,
+                                  InputParallelTensorDesc const &input,
                                   ProfilingSettings const &settings,
                                   MachineView const &machine_view);
 

From 356cdde52eb352659d94a756f5b211c5f0e33103 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 8 Sep 2023 06:18:52 +0000
Subject: [PATCH 03/13] add forward API

---
 lib/runtime/src/ops/reduce.cc | 46 ++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index 5270006bc7..1bed6ee959 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -55,19 +55,17 @@ OpTaskInvocation init(TransposeAttrs const &attrs) {
   binding.bind_arg(HANDLE, ff_handle());.
   binding.bind_arg(ATTRS, attrs);
 
-  bind.bind(REDUCE, reduce_descriptor());//Note: how to bind ffReduceTensorDescriptor_t ?
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  return {REDUCE_INIT_TASK_ID, binding};
 }
 
 static DeviceSpecific<ReducePerDeviceState> init_task_impl(TaskArgumentAccessor const &acc) {
   PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
-  auto input = acc.get_tensor<Permissions::RO>(INPUT);
-  auto output = acc.get_tensor<Permissions::WO>(OUTPUT);
   auto attrs = acc.get_argument<ReduceAttrs>(ATTRS);
-  auto reduce_desc = acc.get_argument<ffReduceTensorDescriptor_t>(REDUCE); //Note: this may has some problem, I should ask Colin 
   OperatorType = attrs.op_type;
   //Note: How to set the reduction size?
+  ffTensorDescriptor_t inputTensor;
+  ffTensorDescriptor_t outputTensor;
+  ffReduceTensorDescriptor_t reduceDesc;
   size_t reduction_size
   DeviceSpecific<ReducePerDeviceState> per_device_state = acc.create_device_specific<ReducePerDeviceState>(init_kernel(handle, input, output, reduce_desc, op_type, reduction_size));
   return per_device_state;
@@ -87,24 +85,48 @@ void register_task<TRANSPOSE_INIT_TASK_ID>() {
     OpTaskSignature init(OpTaskType::INIT)
 
     init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
-    init.add_arg_slot<ReducePerDeviceState>(ATTRS);
-
-    init.add_input_slot(INPUT);
-    init.add_output_slot(OUTPUT);
-    //Note:how to add reduce? 
+    init.add_arg_slot<ReduceAttrs>(ATTRS);
 
     register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task);
 } 
 
+//Note: forward_kernel only needs ReducePerDeviceState, input, output
 OpTaskInvocation forward(ReduceAttrs const & attrs) {
   OpTaskBinding binding;
 
   bind.bind_arg(PER_DEVICE_STATE, per_device_op_state<ReducePerDeviceState>());
   bind.bind_arg(PROFILING, profiling_tensor());
 
-  binding.bind(REDUCE, reduce_descriptor());//Note:this may have some problem, how to bind the ffReduceTensorDescriptor_t 
   binding.bind(INPUT, input_tensor(0));
   binding.bind(OUTPUT, output_tensor(0));
+
+  return {REDUCE_FWD_TASK_ID, binding};
+}
+
+
+static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
+  NOT_IMPLEMENTED();
+}
+
+static void forward_task(Task const *task,
+                         std::vector<PhysicalRegion> const &regions,
+                         Context ctx,
+                         Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  forward_task_impl(acc);
+}
+
+template <>
+void register_task<REDUCE_FWD_TASK_ID>() {
+  OpTaskSignature fwd(OpTaskType::FORWARD);
+
+  fwd.add_unchecked_arg_slot<PerDeviceOpState>(PER_DEVICE_STATE);
+  fwd.add_arg_slot<ProfilingTensor>(PROFILING);
+
+  fwd.add_input_slot(INPUT);
+  fwd.add_output_slot(OUTPUT);
+
+  register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task);
 }
 
 

From c45a318995d64f8ec7de1f481715ce9d5a679a22 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 8 Sep 2023 06:21:57 +0000
Subject: [PATCH 04/13] add backward API

---
 lib/runtime/src/ops/reduce.cc | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index 1bed6ee959..f5801fa584 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -129,6 +129,30 @@ void register_task<REDUCE_FWD_TASK_ID>() {
   register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task);
 }
 
+OpTaskInvocation backward(ReduceAttrs const & attrs) {
+  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
+
+  return {REDUCE_BWD_TASK_ID, binding};
+}
+
+static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
+  NOT_IMPLEMENTED();
+}
+
+static void backward_task(Task const *task,
+                          std::vector<PhysicalRegion> const &regions,
+                          Context ctx,
+                          Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  backward_task_impl(acc);
+}
+
+template <>
+void register_task<REDUCE_BWD_TASK_ID>() {
+    OpTaskSignature bwd = infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID));
+
+    reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task);
+}
 
 // Tensor FFModel::reduce_sum(OperatorType op,
 //                            const Tensor input,

From 074f1fe50adf74acc1014d8bfeef042d20c3dd20 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 8 Sep 2023 06:33:53 +0000
Subject: [PATCH 05/13] implement measure_operator_cost and Reduce version 0.1

---
 lib/runtime/src/ops/reduce.cc | 60 +++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index f5801fa584..ea56c86268 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -4,6 +4,7 @@
 #include "utils/exception.decl.h"
 #include "utils/hash-utils.h"
 #include "op-attrs/get_output_shape.h"
+#include "utils/type_traits_core.h"
 
 namespace FlexFlow {
 // declare Legion names
@@ -105,7 +106,18 @@ OpTaskInvocation forward(ReduceAttrs const & attrs) {
 
 
 static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  NOT_IMPLEMENTED();
+  auto per_device_state = acc.get_argument<ReducePerDeviceState>(PER_DEVICE_STATE);
+  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+
+  auto input = acc.get_tensor<Permissions::RO>(INPUT);
+  auto output = acc.get_tensor<Permissions::WO>(OUTPUT);
+
+  return profile(forward_kernel,
+                 profiling,
+                 "[Reduce] forward_time = %.2lfms\n",
+                 &per_device_state,
+                 input.get_float_ptr(),
+                  output.get_float_ptr());
 }
 
 static void forward_task(Task const *task,
@@ -136,7 +148,18 @@ OpTaskInvocation backward(ReduceAttrs const & attrs) {
 }
 
 static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
-  NOT_IMPLEMENTED();
+  auto per_device_state = acc.get_argument<ReducePerDeviceState>(PER_DEVICE_STATE);
+  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+
+  auto input_grad = acc.get_tensor_grad<Permissions::RO>(INPUT);
+  auto output_grad = acc.get_tensor_grad<Permissions::WO>(OUTPUT);
+
+  return profile(backward_kernel,
+                 profiling,
+                 "[Reduce] backward_time = %.2lfms\n",
+                 &per_device_state,
+                 input.get_float_ptr(),
+                  output.get_float_ptr());
 }
 
 static void backward_task(Task const *task,
@@ -154,6 +177,39 @@ void register_task<REDUCE_BWD_TASK_ID>() {
     reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task);
 }
 
+CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
+                                  ReduceAttrs const &attrs,
+                                  InputParallelTensorDesc const &input,
+                                  ProfilingSettings const &settings,
+                                  MachineView const &machine_view) {
+    auto env = sim.new_environment();
+
+    SimTaskBinding init_binding;
+    init_binding.bind_arg(ATTRS, attrs);  
+    binding.bind_arg(HANDLE, ff_handle());
+
+    auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding);
+    DeviceSpecific<ReducePerDeviceState> per_device_state = init_task_impl(init_accessor);
+
+    SimTaskBinding fwd_binding;
+    ParallelTensorShape output_shape = get_output_shape(attrs, input.shape);
+    fwd.bind(INPUT, input);
+    fwd.bind(OUTPUT, output_shape);
+    fwd.bind_arg(PROFILING, settings);
+    fwd.bind_arg(PER_DEVICE_STATE, per_device_state);
+
+    SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
+
+    auto fwd_accessor = env.get_fwd_accessor(REDUCE_FWD_TASK_ID, fwd_binding);
+    auto bwd_accessor = env.get_bwd_accessor(REDUCE_BWD_TASK_ID, bwd_binding);
+
+    float forward_time = forward_task_impl(fwd_accessor).value();
+    float backward_time = backward_task_impl(bwd_accessor).value();
+
+    float sync_time = default_estimate_sync_time(env);
+    return make_metrics(forward_time, backward_time, sync_time, env);
+  }
+
 // Tensor FFModel::reduce_sum(OperatorType op,
 //                            const Tensor input,
 //                            std::vector<int> const &_axes,

From db204dd75e2bf1dc6d1e74083bafa134f56f36a4 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Fri, 8 Sep 2023 06:34:10 +0000
Subject: [PATCH 06/13] format the code

---
 lib/kernels/include/kernels/reduce_kernels.h |  19 ++-
 lib/kernels/src/cuda/reduce_kernels.cu       |  15 ++-
 lib/runtime/src/ops/reduce.cc                | 134 ++++++++++---------
 lib/runtime/src/ops/reduce.h                 |   2 +-
 4 files changed, 95 insertions(+), 75 deletions(-)

diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h
index 99251ee6dd..7d7f2fc7f0 100644
--- a/lib/kernels/include/kernels/reduce_kernels.h
+++ b/lib/kernels/include/kernels/reduce_kernels.h
@@ -14,18 +14,23 @@ struct ReducePerDeviceState {
   size_t reduction_size;
 };
 
-FF_VISITABLE_STRUCT_NO_EQ(ReducePerDeviceState, handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size);
-
+FF_VISITABLE_STRUCT_NO_EQ(ReducePerDeviceState,
+                          handle,
+                          inputTensor,
+                          outputTensor,
+                          reduceDesc,
+                          op_type,
+                          reduction_size);
 
 namespace Kernels {
 namespace Reduce {
 
 ReducePerDeviceState init_kernel(PerDeviceFFhandle const &,
-                                ffTensorDescriptor_t const &,
-                                ffTensorDescriptor_t const &,
-                                ffReduceTensorDescriptor_t const &,
-                                OperatorType const &,
-                                size_t const &);
+                                 ffTensorDescriptor_t const &,
+                                 ffTensorDescriptor_t const &,
+                                 ffReduceTensorDescriptor_t const &,
+                                 OperatorType const &,
+                                 size_t const &);
 
 void forward_kernel_wrapper(ReducePerDeviceState const *m,
                             GenericTensorAccessorR const &input,
diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu
index 245b0d06e5..87e3be991c 100644
--- a/lib/kernels/src/cuda/reduce_kernels.cu
+++ b/lib/kernels/src/cuda/reduce_kernels.cu
@@ -67,13 +67,14 @@ ReducePerDeviceState::~ReducePerDeviceState(void) {
 namespace Kernels {
 namespace Reduce {
 
-ReducePerDeviceState init_kernel(PerDeviceFFhandle const & handle,
-    ffTensorDescriptor_t const & input_tensor,
-    ffTensorDescriptor_t const & outputTensor,
-    ffReduceTensorDescriptor_t const & reduceDesc,
-    OperatorType const & op_type,
-    size_t const &  reduction_size) {
-      return {handle, input_tensor, outputTensor, reduceDesc, op_type, reduction_size};
+ReducePerDeviceState init_kernel(PerDeviceFFhandle const &handle,
+                                 ffTensorDescriptor_t const &input_tensor,
+                                 ffTensorDescriptor_t const &outputTensor,
+                                 ffReduceTensorDescriptor_t const &reduceDesc,
+                                 OperatorType const &op_type,
+                                 size_t const &reduction_size) {
+  return {
+      handle, input_tensor, outputTensor, reduceDesc, op_type, reduction_size};
 }
 
 void forward_kernel(cudaStream_t stream,
diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index ea56c86268..c06a75c7da 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -1,9 +1,9 @@
 #include "reduce.h"
 #include "kernels/reduce_kernels.h"
 #include "legion/legion_utilities.h"
+#include "op-attrs/get_output_shape.h"
 #include "utils/exception.decl.h"
 #include "utils/hash-utils.h"
-#include "op-attrs/get_output_shape.h"
 #include "utils/type_traits_core.h"
 
 namespace FlexFlow {
@@ -48,27 +48,37 @@ ReduceParams Reduce::get_params() const {
   return params;
 }
 
-enum Slots {INPUT, OUTPUT, ATTRS, PROFILING, REDUCE, PER_DEVICE_STATE, HANDLE};
+enum Slots {
+  INPUT,
+  OUTPUT,
+  ATTRS,
+  PROFILING,
+  REDUCE,
+  PER_DEVICE_STATE,
+  HANDLE
+};
 
 OpTaskInvocation init(TransposeAttrs const &attrs) {
   OpTaskBinding binding;
 
-  binding.bind_arg(HANDLE, ff_handle());.
-  binding.bind_arg(ATTRS, attrs);
+  binding.bind_arg(HANDLE, ff_handle());
+  .binding.bind_arg(ATTRS, attrs);
 
   return {REDUCE_INIT_TASK_ID, binding};
 }
 
-static DeviceSpecific<ReducePerDeviceState> init_task_impl(TaskArgumentAccessor const &acc) {
+static DeviceSpecific<ReducePerDeviceState>
+    init_task_impl(TaskArgumentAccessor const &acc) {
   PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
   auto attrs = acc.get_argument<ReduceAttrs>(ATTRS);
   OperatorType = attrs.op_type;
-  //Note: How to set the reduction size?
+  // Note: How to set the reduction size?
   ffTensorDescriptor_t inputTensor;
   ffTensorDescriptor_t outputTensor;
   ffReduceTensorDescriptor_t reduceDesc;
-  size_t reduction_size
-  DeviceSpecific<ReducePerDeviceState> per_device_state = acc.create_device_specific<ReducePerDeviceState>(init_kernel(handle, input, output, reduce_desc, op_type, reduction_size));
+  size_t reduction_size DeviceSpecific<ReducePerDeviceState> per_device_state =
+      acc.create_device_specific<ReducePerDeviceState>(init_kernel(
+          handle, input, output, reduce_desc, op_type, reduction_size));
   return per_device_state;
 }
 
@@ -81,18 +91,17 @@ static DeviceSpecific<TransposePerDeviceState>
   return init_task_impl(acc);
 }
 
-plate <>
-void register_task<TRANSPOSE_INIT_TASK_ID>() {
-    OpTaskSignature init(OpTaskType::INIT)
+plate<> void register_task<TRANSPOSE_INIT_TASK_ID>() {
+  OpTaskSignature init(OpTaskType::INIT)
 
-    init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
-    init.add_arg_slot<ReduceAttrs>(ATTRS);
+      init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_arg_slot<ReduceAttrs>(ATTRS);
 
-    register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task);
-} 
+  register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task);
+}
 
-//Note: forward_kernel only needs ReducePerDeviceState, input, output
-OpTaskInvocation forward(ReduceAttrs const & attrs) {
+// Note: forward_kernel only needs ReducePerDeviceState, input, output
+OpTaskInvocation forward(ReduceAttrs const &attrs) {
   OpTaskBinding binding;
 
   bind.bind_arg(PER_DEVICE_STATE, per_device_op_state<ReducePerDeviceState>());
@@ -104,9 +113,9 @@ OpTaskInvocation forward(ReduceAttrs const & attrs) {
   return {REDUCE_FWD_TASK_ID, binding};
 }
 
-
 static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state = acc.get_argument<ReducePerDeviceState>(PER_DEVICE_STATE);
+  auto per_device_state =
+      acc.get_argument<ReducePerDeviceState>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
 
   auto input = acc.get_tensor<Permissions::RO>(INPUT);
@@ -117,7 +126,7 @@ static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
                  "[Reduce] forward_time = %.2lfms\n",
                  &per_device_state,
                  input.get_float_ptr(),
-                  output.get_float_ptr());
+                 output.get_float_ptr());
 }
 
 static void forward_task(Task const *task,
@@ -141,14 +150,15 @@ void register_task<REDUCE_FWD_TASK_ID>() {
   register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task);
 }
 
-OpTaskInvocation backward(ReduceAttrs const & attrs) {
+OpTaskInvocation backward(ReduceAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
 
   return {REDUCE_BWD_TASK_ID, binding};
 }
 
 static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state = acc.get_argument<ReducePerDeviceState>(PER_DEVICE_STATE);
+  auto per_device_state =
+      acc.get_argument<ReducePerDeviceState>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
 
   auto input_grad = acc.get_tensor_grad<Permissions::RO>(INPUT);
@@ -159,7 +169,7 @@ static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
                  "[Reduce] backward_time = %.2lfms\n",
                  &per_device_state,
                  input.get_float_ptr(),
-                  output.get_float_ptr());
+                 output.get_float_ptr());
 }
 
 static void backward_task(Task const *task,
@@ -172,9 +182,10 @@ static void backward_task(Task const *task,
 
 template <>
 void register_task<REDUCE_BWD_TASK_ID>() {
-    OpTaskSignature bwd = infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID));
+  OpTaskSignature bwd =
+      infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID));
 
-    reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task);
+  reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task);
 }
 
 CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
@@ -182,33 +193,34 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
                                   InputParallelTensorDesc const &input,
                                   ProfilingSettings const &settings,
                                   MachineView const &machine_view) {
-    auto env = sim.new_environment();
+  auto env = sim.new_environment();
 
-    SimTaskBinding init_binding;
-    init_binding.bind_arg(ATTRS, attrs);  
-    binding.bind_arg(HANDLE, ff_handle());
+  SimTaskBinding init_binding;
+  init_binding.bind_arg(ATTRS, attrs);
+  binding.bind_arg(HANDLE, ff_handle());
 
-    auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding);
-    DeviceSpecific<ReducePerDeviceState> per_device_state = init_task_impl(init_accessor);
+  auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding);
+  DeviceSpecific<ReducePerDeviceState> per_device_state =
+      init_task_impl(init_accessor);
 
-    SimTaskBinding fwd_binding;
-    ParallelTensorShape output_shape = get_output_shape(attrs, input.shape);
-    fwd.bind(INPUT, input);
-    fwd.bind(OUTPUT, output_shape);
-    fwd.bind_arg(PROFILING, settings);
-    fwd.bind_arg(PER_DEVICE_STATE, per_device_state);
+  SimTaskBinding fwd_binding;
+  ParallelTensorShape output_shape = get_output_shape(attrs, input.shape);
+  fwd.bind(INPUT, input);
+  fwd.bind(OUTPUT, output_shape);
+  fwd.bind_arg(PROFILING, settings);
+  fwd.bind_arg(PER_DEVICE_STATE, per_device_state);
 
-    SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
+  SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
 
-    auto fwd_accessor = env.get_fwd_accessor(REDUCE_FWD_TASK_ID, fwd_binding);
-    auto bwd_accessor = env.get_bwd_accessor(REDUCE_BWD_TASK_ID, bwd_binding);
+  auto fwd_accessor = env.get_fwd_accessor(REDUCE_FWD_TASK_ID, fwd_binding);
+  auto bwd_accessor = env.get_bwd_accessor(REDUCE_BWD_TASK_ID, bwd_binding);
 
-    float forward_time = forward_task_impl(fwd_accessor).value();
-    float backward_time = backward_task_impl(bwd_accessor).value();
+  float forward_time = forward_task_impl(fwd_accessor).value();
+  float backward_time = backward_task_impl(bwd_accessor).value();
 
-    float sync_time = default_estimate_sync_time(env);
-    return make_metrics(forward_time, backward_time, sync_time, env);
-  }
+  float sync_time = default_estimate_sync_time(env);
+  return make_metrics(forward_time, backward_time, sync_time, env);
+}
 
 // Tensor FFModel::reduce_sum(OperatorType op,
 //                            const Tensor input,
@@ -291,7 +303,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
 //                ReduceParams const &params,
 //                const ParallelTensor input,
 //                char const *name)
-//     : Reduce(model, params.op_type, input, params.axes, params.keepdims, name) {
+//     : Reduce(model, params.op_type, input, params.axes, params.keepdims,
+//     name) {
 // }
 
 // Reduce::Reduce(FFModel &model,
@@ -379,9 +392,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
 // };
 
 // PerDeviceOpState *Reduce::init_task(Task const *task,
-//                                     std::vector<PhysicalRegion> const &regions,
-//                                     Context ctx,
-//                                     Runtime *runtime) {
+//                                     std::vector<PhysicalRegion> const
+//                                     &regions, Context ctx, Runtime *runtime)
+//                                     {
 //   Reduce *rd = (Reduce *)task->args;
 //   FFHandler handle = *((FFHandler *)task->local_args);
 //   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
@@ -491,16 +504,16 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
 //   }
 //   ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain());
 //   sim->free_all();
-//   float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-//   assert(input_ptr != NULL);
-//   cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-//   GenericTensorAccessorR input_acc(
+//   float *input_ptr = (float *)sim->allocate(sub_input.get_volume(),
+//   DT_FLOAT); assert(input_ptr != NULL); cost_metrics.inputs_memory +=
+//   cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorR
+//   input_acc(
 //       inputs[0]->data_type, sub_input.get_domain(), input_ptr);
 
-//   float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-//   assert(output_ptr != NULL);
-//   cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-//   GenericTensorAccessorW output_acc(
+//   float *output_ptr = (float *)sim->allocate(sub_output.get_volume(),
+//   DT_FLOAT); assert(output_ptr != NULL); cost_metrics.outputs_memory +=
+//   cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorW
+//   output_acc(
 //       outputs[0]->data_type, sub_output.get_domain(), output_ptr);
 
 //   assert(m->profiling == false);
@@ -511,8 +524,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
 //     float *input_grad_ptr =
 //         (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
 //     assert(input_grad_ptr != NULL);
-//     cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-//     GenericTensorAccessorW input_grad_acc(
+//     cost_metrics.inputs_memory +=
+//     cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorW
+//     input_grad_acc(
 //         inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr);
 
 //     float *output_grad_ptr =
@@ -597,4 +611,4 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
 //   hash_combine(key, params.keepdims);
 //   return key;
 // }
-}; // namespace std
+}; // namespace FlexFlow
diff --git a/lib/runtime/src/ops/reduce.h b/lib/runtime/src/ops/reduce.h
index fd3df85281..099083ed67 100644
--- a/lib/runtime/src/ops/reduce.h
+++ b/lib/runtime/src/ops/reduce.h
@@ -2,8 +2,8 @@
 #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H
 
 #include "op-attrs/ops/reduce.h"
-#include "task_spec/op_task_invocation.h"
 #include "sim_environment.h"
+#include "task_spec/op_task_invocation.h"
 
 namespace FlexFlow {
 

From bf95bc45170edc11c33ca8ffa413511316c62395 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 24 Sep 2023 21:33:40 +0000
Subject: [PATCH 07/13] update the redudce

---
 lib/kernels/include/kernels/reduce_kernels.h | 13 +++--
 lib/kernels/src/cuda/reduce_kernels.cu       | 53 ++++++++++++--------
 lib/runtime/src/ops/reduce.cc                | 24 +++++----
 3 files changed, 52 insertions(+), 38 deletions(-)

diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h
index 7d7f2fc7f0..9db7bd9aea 100644
--- a/lib/kernels/include/kernels/reduce_kernels.h
+++ b/lib/kernels/include/kernels/reduce_kernels.h
@@ -14,7 +14,7 @@ struct ReducePerDeviceState {
   size_t reduction_size;
 };
 
-FF_VISITABLE_STRUCT_NO_EQ(ReducePerDeviceState,
+FF_VISITABLE_STRUCT(ReducePerDeviceState,
                           handle,
                           inputTensor,
                           outputTensor,
@@ -26,17 +26,16 @@ namespace Kernels {
 namespace Reduce {
 
 ReducePerDeviceState init_kernel(PerDeviceFFhandle const &,
-                                 ffTensorDescriptor_t const &,
-                                 ffTensorDescriptor_t const &,
-                                 ffReduceTensorDescriptor_t const &,
                                  OperatorType const &,
-                                 size_t const &);
+                                 size_t const &,
+                                 ArrayShape input_shape,
+                                 ArrayShape output_shape);
 
-void forward_kernel_wrapper(ReducePerDeviceState const *m,
+void forward_kernel_wrapper(ReducePerDeviceState const &m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output);
 
-void backward_kernel_wrapper(ReducePerDeviceState const *m,
+void backward_kernel_wrapper(ReducePerDeviceState const &m,
                              GenericTensorAccessorR const &output_grad,
                              GenericTensorAccessorW const &input_grad);
 
diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu
index 87e3be991c..b7a5c824f2 100644
--- a/lib/kernels/src/cuda/reduce_kernels.cu
+++ b/lib/kernels/src/cuda/reduce_kernels.cu
@@ -68,59 +68,70 @@ namespace Kernels {
 namespace Reduce {
 
 ReducePerDeviceState init_kernel(PerDeviceFFhandle const &handle,
-                                 ffTensorDescriptor_t const &input_tensor,
-                                 ffTensorDescriptor_t const &outputTensor,
-                                 ffReduceTensorDescriptor_t const &reduceDesc,
                                  OperatorType const &op_type,
-                                 size_t const &reduction_size) {
-  return {
-      handle, input_tensor, outputTensor, reduceDesc, op_type, reduction_size};
+                                 size_t const &reduction_size,
+                                 ArrayShape const & input_shape,
+                                 ArrayShape const & output_shape) {
+  
+  ffTensorDescriptor_t inputTensor;
+  ffTensorDescriptor_t outputTensor;
+  ffReduceTensorDescriptor_t reduceDesc;
+
+  checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
+  checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));;
+  checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+
+  checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape));
+  checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape));
+
+  ReducePerDeviceState per_device = {handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size};
+
 }
 
 void forward_kernel(cudaStream_t stream,
-                    ReducePerDeviceState const *m,
+                    ReducePerDeviceState const &m,
                     float const *input_ptr,
                     float *output_ptr) {
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
   float alpha = 1.0f, beta = 0.0f;
-  checkCUDNN(cudnnReduceTensor(m->handle.dnn,
-                               m->reduceDesc,
+  checkCUDNN(cudnnReduceTensor(m.handle.dnn,
+                               m.reduceDesc,
                                nullptr /*indices*/,
                                0 /*indicesSizeInBytes*/,
-                               m->handle.workSpace,
-                               m->handle.workSpaceSize,
+                               m.handle.workSpace,
+                               m.handle.workSpaceSize,
                                &alpha,
-                               m->inputTensor,
+                               m.inputTensor,
                                input_ptr,
                                &beta,
-                               m->outputTensor,
+                               m.outputTensor,
                                output_ptr));
 };
 
 void backward_kernel(cudaStream_t stream,
-                     ReducePerDeviceState const *m,
+                     ReducePerDeviceState const &m,
                      float const *output_grad_ptr,
                      float *input_grad_ptr) {
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
   float alpha = 1.0, beta = 1.0f;
-  switch (m->op_type) {
+  switch (m.op_type) {
     case OP_REDUCE_SUM:
       alpha = 1.0f;
       break;
     case OP_REDUCE_MEAN:
       // When the output is the average of multiple input elements
       // we need to scale the gradients by 1.0 / reduction_size
-      alpha = 1.0f / m->reduction_size;
+      alpha = 1.0f / m.reduction_size;
       break;
     default:
       assert(false);
   }
-  checkCUDNN(cudnnAddTensor(m->handle.dnn,
+  checkCUDNN(cudnnAddTensor(m.handle.dnn,
                             &alpha,
-                            m->outputTensor,
+                            m.outputTensor,
                             output_grad_ptr,
                             &beta,
-                            m->inputTensor,
+                            m.inputTensor,
                             input_grad_ptr));
 }
 
diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index c06a75c7da..59ce5f9973 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -62,7 +62,10 @@ OpTaskInvocation init(TransposeAttrs const &attrs) {
   OpTaskBinding binding;
 
   binding.bind_arg(HANDLE, ff_handle());
-  .binding.bind_arg(ATTRS, attrs);
+  binding.bind_arg(ATTRS, attrs);
+
+  binding.bind(INPUT, input_parallel_tensor_shape(0));
+  binding.bind(OUTPUT, output_parallel_tensor_shape(0));
 
   return {REDUCE_INIT_TASK_ID, binding};
 }
@@ -71,14 +74,15 @@ static DeviceSpecific<ReducePerDeviceState>
     init_task_impl(TaskArgumentAccessor const &acc) {
   PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
   auto attrs = acc.get_argument<ReduceAttrs>(ATTRS);
+  auto input = acc.get_tensor<Permissions::RO>(INPUT);
+  auto output = acc.get_tensor<Permissions::WO>(OUTPUT);
+
   OperatorType = attrs.op_type;
   // Note: How to set the reduction size?
-  ffTensorDescriptor_t inputTensor;
-  ffTensorDescriptor_t outputTensor;
-  ffReduceTensorDescriptor_t reduceDesc;
-  size_t reduction_size DeviceSpecific<ReducePerDeviceState> per_device_state =
+  size_t reduction_size ;
+  DeviceSpecific<ReducePerDeviceState> per_device_state =
       acc.create_device_specific<ReducePerDeviceState>(init_kernel(
-          handle, input, output, reduce_desc, op_type, reduction_size));
+          handle, op_type, reduction_size, input.shape, output.shape));
   return per_device_state;
 }
 
@@ -107,7 +111,7 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) {
   bind.bind_arg(PER_DEVICE_STATE, per_device_op_state<ReducePerDeviceState>());
   bind.bind_arg(PROFILING, profiling_tensor());
 
-  binding.bind(INPUT, input_tensor(0));
+  binding.bind(INPUT, input_parallel_tensor_shape(0));
   binding.bind(OUTPUT, output_tensor(0));
 
   return {REDUCE_FWD_TASK_ID, binding};
@@ -124,7 +128,7 @@ static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   return profile(forward_kernel,
                  profiling,
                  "[Reduce] forward_time = %.2lfms\n",
-                 &per_device_state,
+                 per_device_state,
                  input.get_float_ptr(),
                  output.get_float_ptr());
 }
@@ -142,7 +146,7 @@ void register_task<REDUCE_FWD_TASK_ID>() {
   OpTaskSignature fwd(OpTaskType::FORWARD);
 
   fwd.add_unchecked_arg_slot<PerDeviceOpState>(PER_DEVICE_STATE);
-  fwd.add_arg_slot<ProfilingTensor>(PROFILING);
+  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
 
   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
@@ -167,7 +171,7 @@ static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
   return profile(backward_kernel,
                  profiling,
                  "[Reduce] backward_time = %.2lfms\n",
-                 &per_device_state,
+                 per_device_state,
                  input.get_float_ptr(),
                  output.get_float_ptr());
 }

From be032a196ee851e14a90d0811a4c9b9e39fd4be2 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 24 Sep 2023 21:34:27 +0000
Subject: [PATCH 08/13] update the redudce

---
 lib/runtime/src/ops/reduce.cc | 389 ----------------------------------
 1 file changed, 389 deletions(-)

diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index 59ce5f9973..bfcf64c186 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -226,393 +226,4 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
   return make_metrics(forward_time, backward_time, sync_time, env);
 }
 
-// Tensor FFModel::reduce_sum(OperatorType op,
-//                            const Tensor input,
-//                            std::vector<int> const &_axes,
-//                            bool keepdims,
-//                            char const *name) {
-//   Layer *rd = new Layer(this,
-//                         op,
-//                         DT_FLOAT,
-//                         name,
-//                         1 /*input*/,
-//                         0 /*weights*/,
-//                         1 /*outputs*/,
-//                         input);
-//   // Use Legion indexing to store axes
-//   std::vector<int> axes;
-//   for (size_t i = 0; i < _axes.size(); i++) {
-//     axes.push_back(input->num_dims - 1 - _axes[i]);
-//   }
-//   int dims[MAX_TENSOR_DIM];
-//   int numdim = input->num_dims;
-//   if (keepdims) {
-//     for (int i = 0; i < input->num_dims; i++) {
-//       dims[i] = input->dims[i];
-//     }
-//     for (size_t i = 0; i < axes.size(); i++) {
-//       dims[axes[i]] = 1;
-//     }
-//   } else {
-//     numdim = 0;
-//     for (int i = 0; i < input->num_dims; i++) {
-//       bool reduced = false;
-//       for (size_t j = 0; j < axes.size(); j++) {
-//         if (axes[j] == i) {
-//           reduced = true;
-//         }
-//       }
-//       if (!reduced) {
-//         dims[numdim++] = input->dims[i];
-//       }
-//     }
-//     assert(numdim + axes.size() == input->num_dims);
-//   }
-//   rd->outputs[0] = create_tensor_legion_ordering(
-//       numdim, dims, input->data_type, rd, 0, true /*create_grad*/);
-//   rd->add_int_vector_property("legion_axes", axes);
-//   rd->add_int_property("keepdims", keepdims);
-//   layers.push_back(rd);
-//   return rd->outputs[0];
-// }
-
-// Tensor FFModel::reduce_sum(const Tensor input,
-//                            std::vector<int> const &_axes,
-//                            bool keepdims,
-//                            char const *name) {
-//   return this->reduce(OP_REDUCE_SUM, input, _axes, keepdims, name);
-// }
-
-// Tensor FFModel::reduce_mean(const Tensor input,
-//                             std::vector<int> const &_axes,
-//                             bool keepdims,
-//                             char const *name) {
-//   return this->reduce(OP_REDUCE_MEAN, input, _axes, keepdims, name);
-// }
-
-// Op *Reduce::create_operator_from_layer(
-//     FFModel &model,
-//     Layer const *layer,
-//     std::vector<ParallelTensor> const &inputs) {
-//   std::vector<int> axes;
-//   long long value;
-//   layer->get_int_vector_property("legion_axes", axes);
-//   layer->get_int_property("keepdims", value);
-//   bool keepdims = value;
-//   return new Reduce(
-//       model, layer->op_type, inputs[0], axes, keepdims, layer->name);
-// }
-
-// Reduce::Reduce(FFModel &model,
-//                ReduceParams const &params,
-//                const ParallelTensor input,
-//                char const *name)
-//     : Reduce(model, params.op_type, input, params.axes, params.keepdims,
-//     name) {
-// }
-
-// Reduce::Reduce(FFModel &model,
-//                OperatorType _op_type,
-//                const ParallelTensor input,
-//                std::vector<int> const &_axes,
-//                bool _keepdims,
-//                char const *name)
-//     : Op(model,
-//          _op_type,
-//          input->data_type,
-//          name,
-//          1 /*inputs*/,
-//          0 /*weights*/,
-//          1 /*outputs*/,
-//          input),
-//       num_axes(_axes.size()), keepdims(_keepdims) {
-//   for (size_t i = 0; i < num_axes; i++) {
-//     axes[i] = _axes[i];
-//   }
-//   int num_dims = input->num_dims;
-//   ParallelDim dims[MAX_TENSOR_DIM];
-//   if (keepdims) {
-//     num_dims = input->num_dims;
-//     for (int i = 0; i < num_dims; i++) {
-//       dims[i] = input->dims[i];
-//     }
-//     for (int i = 0; i < num_axes; i++) {
-//       // Currently assume that we cannot parallelize along reduced dims
-//       assert(dims[axes[i]].degree == 1);
-//       dims[axes[i]].size = 1;
-//     }
-//   } else {
-//     num_dims = 0;
-//     for (int i = 0; i < input->num_dims; i++) {
-//       bool reduced = false;
-//       for (int j = 0; j < num_axes; j++) {
-//         if (axes[j] == i) {
-//           reduced = true;
-//         }
-//       }
-//       if (!reduced) {
-//         dims[num_dims++] = input->dims[i];
-//       } else {
-//         // Currently assume that we cannot parallelize along reduced dims
-//         assert(input->dims[i].degree == 1);
-//         assert(input->dims[i].parallel_idx == -1);
-//       }
-//     }
-//   }
-//   outputs[0] = model.create_parallel_tensor_legion_ordering(
-//       num_dims, dims, input->data_type, this);
-// }
-
-// void Reduce::init(FFModel const &ff) {
-//   assert(check_output_input_weight_same_parallel_is());
-//   parallel_is = outputs[0]->parallel_is;
-//   ArgumentMap argmap;
-//   Context ctx = ff.config.lg_ctx;
-//   Runtime *runtime = ff.config.lg_hlr;
-//   set_argumentmap_for_init(ff, argmap);
-//   IndexLauncher launcher(REDUCE_INIT_TASK_ID,
-//                          parallel_is,
-//                          TaskArgument(this, sizeof(Reduce)),
-//                          argmap,
-//                          Predicate::TRUE_PRED,
-//                          false /*must*/,
-//                          0 /*mapper_id*/,
-//                          outputs[0]->machine_view.hash());
-//   launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-//                                                     0 /*projection id*/,
-//                                                     READ_ONLY,
-//                                                     EXCLUSIVE,
-//                                                     inputs[0]->region));
-//   launcher.add_field(0, FID_DATA);
-//   launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-//                                                     0 /*projection id*/,
-//                                                     WRITE_ONLY,
-//                                                     EXCLUSIVE,
-//                                                     outputs[0]->region));
-//   launcher.add_field(1, FID_DATA);
-//   FutureMap fm = runtime->execute_index_space(ctx, launcher);
-//   fm.wait_all_results();
-//   set_opmeta_from_futuremap(ff, fm);
-// };
-
-// PerDeviceOpState *Reduce::init_task(Task const *task,
-//                                     std::vector<PhysicalRegion> const
-//                                     &regions, Context ctx, Runtime *runtime)
-//                                     {
-//   Reduce *rd = (Reduce *)task->args;
-//   FFHandler handle = *((FFHandler *)task->local_args);
-//   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
-//       DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-//   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-//       DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-//   ReduceMeta *m = new ReduceMeta(handle, rd, input.domain);
-//   return m;
-// }
-
-// void Reduce::forward(FFModel const &ff) {
-//   ArgumentMap argmap;
-//   Context ctx = ff.config.lg_ctx;
-//   Runtime *runtime = ff.config.lg_hlr;
-//   set_argumentmap_for_forward(ff, argmap);
-//   IndexLauncher launcher(REDUCE_FWD_TASK_ID,
-//                          parallel_is,
-//                          TaskArgument(nullptr, false),
-//                          argmap,
-//                          Predicate::TRUE_PRED,
-//                          false /*must*/,
-//                          0 /*mapper_id*/,
-//                          outputs[0]->machine_view.hash());
-//   launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-//                                                     0 /*projection id*/,
-//                                                     READ_ONLY,
-//                                                     EXCLUSIVE,
-//                                                     inputs[0]->region));
-//   launcher.add_field(0, FID_DATA);
-//   launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-//                                                     0 /*projection id*/,
-//                                                     WRITE_ONLY,
-//                                                     EXCLUSIVE,
-//                                                     outputs[0]->region));
-//   launcher.add_field(1, FID_DATA);
-//   runtime->execute_index_space(ctx, launcher);
-// }
-
-// void Reduce::forward_task(Task const *task,
-//                           std::vector<PhysicalRegion> const &regions,
-//                           Context ctx,
-//                           Runtime *runtime) {
-//   assert(regions.size() == 2);
-//   assert(task->regions.size() == 2);
-//   ReduceMeta const *m = *((ReduceMeta **)task->local_args);
-//   GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
-//       DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-//   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-//       DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-
-//   forward_kernel_wrapper(m, input, output);
-// }
-
-// void Reduce::backward(FFModel const &ff) {
-//   ArgumentMap argmap;
-//   Context ctx = ff.config.lg_ctx;
-//   Runtime *runtime = ff.config.lg_hlr;
-//   set_argumentmap_for_backward(ff, argmap);
-//   IndexLauncher launcher(REDUCE_BWD_TASK_ID,
-//                          parallel_is,
-//                          TaskArgument(nullptr, 0),
-//                          argmap,
-//                          Predicate::TRUE_PRED,
-//                          false /*must*/,
-//                          0 /*mapper_id*/,
-//                          outputs[0]->machine_view.hash());
-//   // regions[0](I): output_grad
-//   launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad,
-//                                                     0 /*projection id*/,
-//                                                     READ_ONLY,
-//                                                     EXCLUSIVE,
-//                                                     outputs[0]->region_grad));
-//   launcher.add_field(0, FID_DATA);
-//   // regions[1](I/O): input_grad
-//   launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad,
-//                                                     0 /*projection id*/,
-//                                                     READ_WRITE,
-//                                                     EXCLUSIVE,
-//                                                     inputs[0]->region_grad));
-//   launcher.add_field(1, FID_DATA);
-//   runtime->execute_index_space(ctx, launcher);
-// }
-
-// void Reduce::backward_task(Task const *task,
-//                            std::vector<PhysicalRegion> const &regions,
-//                            Context ctx,
-//                            Runtime *runtime) {
-//   assert(regions.size() == 2);
-//   assert(task->regions.size() == 2);
-//   ReduceMeta const *m = *((ReduceMeta **)task->local_args);
-//   GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO(
-//       DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-//   GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
-//       DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-//   backward_kernel_wrapper(m, output_grad, input_grad);
-// }
-
-// bool Reduce::measure_operator_cost(Simulator *sim,
-//                                    MachineView const &mv,
-//                                    CostMetrics &cost_metrics) const {
-//   ParallelTensorBase sub_input, sub_output;
-//   if (!outputs[0]->get_sub_tensor(mv, sub_output)) {
-//     return false;
-//   }
-//   if (!inputs[0]->get_sub_tensor(mv, sub_input)) {
-//     return false;
-//   }
-//   ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain());
-//   sim->free_all();
-//   float *input_ptr = (float *)sim->allocate(sub_input.get_volume(),
-//   DT_FLOAT); assert(input_ptr != NULL); cost_metrics.inputs_memory +=
-//   cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorR
-//   input_acc(
-//       inputs[0]->data_type, sub_input.get_domain(), input_ptr);
-
-//   float *output_ptr = (float *)sim->allocate(sub_output.get_volume(),
-//   DT_FLOAT); assert(output_ptr != NULL); cost_metrics.outputs_memory +=
-//   cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorW
-//   output_acc(
-//       outputs[0]->data_type, sub_output.get_domain(), output_ptr);
-
-//   assert(m->profiling == false);
-
-//   std::function<void()> forward, backward;
-//   forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); };
-//   if (sim->computationMode == COMP_MODE_TRAINING) {
-//     float *input_grad_ptr =
-//         (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-//     assert(input_grad_ptr != NULL);
-//     cost_metrics.inputs_memory +=
-//     cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorW
-//     input_grad_acc(
-//         inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr);
-
-//     float *output_grad_ptr =
-//         (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-//     assert(output_grad_ptr != NULL);
-//     cost_metrics.outputs_memory +=
-//         cost_metrics.total_mem_diff_from(sim->offset);
-//     GenericTensorAccessorR output_grad_acc(
-//         outputs[0]->data_type, sub_output.get_domain(), output_grad_ptr);
-
-//     backward = [&] {
-//       backward_kernel_wrapper(m, output_grad_acc, input_grad_acc);
-//     };
-//   }
-
-//   inner_measure_operator_cost(sim, forward, backward, cost_metrics);
-
-//   if (sim->computationMode == COMP_MODE_TRAINING) {
-//     printf("[Measure Reduce] name(%s) forward_time(%.4lf) "
-//            "backward_time(%.4lf)\n",
-//            name,
-//            cost_metrics.forward_time,
-//            cost_metrics.backward_time);
-//   } else {
-//     printf("[Measure Reduce] name(%s) forward_time(%.4lf)\n",
-//            name,
-//            cost_metrics.forward_time);
-//   }
-
-//   return true;
-// }
-
-// void Reduce::serialize(Legion::Serializer &sez) const {
-//   ReduceParams params = get_params();
-//   sez.serialize(params.op_type);
-//   sez.serialize(params.axes.size());
-//   for (size_t i = 0; i < params.axes.size(); i++) {
-//     sez.serialize(params.axes[i]);
-//   }
-//   sez.serialize(params.keepdims);
-// }
-
-// using PCG::Node;
-// Node Reduce::deserialize(FFModel &ff,
-//                          Legion::Deserializer &dez,
-//                          ParallelTensor inputs[],
-//                          int num_inputs) {
-//   assert(num_inputs == 1);
-//   OperatorType op_type;
-//   size_t axes_size;
-//   bool keepdims;
-//   std::vector<int> axes;
-//   dez.deserialize(op_type);
-//   dez.deserialize(axes_size);
-//   for (size_t i = 0; i < axes_size; i++) {
-//     int dim_idx;
-//     dez.deserialize(dim_idx);
-//     axes.push_back(dim_idx);
-//   }
-//   dez.deserialize(keepdims);
-//   return ff.get_or_create_node<Reduce>(inputs[0], {axes, op_type, keepdims});
-// }
-
-// Op *Reduce::materialize(FFModel &ff,
-//                         ParallelTensor inputs[],
-//                         int num_inputs) const {
-//   ReduceParams params = get_params();
-//   return new Reduce(ff, params, inputs[0], this->name);
-// }
-
-// }; // namespace FlexFlow
-
-// namespace std {
-// size_t hash<FlexFlow::ReduceParams>::operator()(
-//     FlexFlow::ReduceParams const &params) const {
-//   size_t key = 0;
-//   hash_combine(key, params.op_type);
-//   hash_combine(key, params.axes.size());
-//   for (int n : params.axes) {
-//     hash_combine(key, n);
-//   }
-//   hash_combine(key, params.keepdims);
-//   return key;
-// }
 }; // namespace FlexFlow

From bb542fdcb36e8cc41781c4248b103d97005188af Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Wed, 27 Sep 2023 20:38:34 +0000
Subject: [PATCH 09/13] use exceptions in reduces.cc

---
 lib/runtime/src/ops/reduce.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index bfcf64c186..fdf381910e 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -2,7 +2,7 @@
 #include "kernels/reduce_kernels.h"
 #include "legion/legion_utilities.h"
 #include "op-attrs/get_output_shape.h"
-#include "utils/exception.decl.h"
+#include "utils/exceptions.h"
 #include "utils/hash-utils.h"
 #include "utils/type_traits_core.h"
 

From ae22a95b621199cf20fbbecc8b6decb933e0d1f2 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sat, 30 Sep 2023 13:16:53 +0000
Subject: [PATCH 10/13] fix the typo

---
 lib/runtime/src/ops/reduce.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index fdf381910e..49b1be4f1e 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -95,10 +95,10 @@ static DeviceSpecific<TransposePerDeviceState>
   return init_task_impl(acc);
 }
 
-plate<> void register_task<TRANSPOSE_INIT_TASK_ID>() {
+template<> void register_task<TRANSPOSE_INIT_TASK_ID>() {
   OpTaskSignature init(OpTaskType::INIT)
 
-      init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
   init.add_arg_slot<ReduceAttrs>(ATTRS);
 
   register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task);

From 0573b8874f4ac3c378b08993b64389911f8ca31b Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 10 Oct 2023 20:34:26 +0000
Subject: [PATCH 11/13] fix the reduce.cc

---
 lib/runtime/src/ops/reduce.cc | 33 ++++++---------------------------
 1 file changed, 6 insertions(+), 27 deletions(-)

diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index 49b1be4f1e..fe7130ca88 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -25,29 +25,6 @@ using Legion::TaskLauncher;
 
 using namespace FlexFlow::Kernels::Reduce;
 
-bool operator==(ReduceParams const &lhs, ReduceParams const &rhs) {
-  return (lhs.axes == rhs.axes) && (lhs.keepdims == rhs.keepdims);
-}
-
-bool ReduceParams::is_valid(ParallelTensorShape const &input) const {
-  for (size_t i = 0; i < axes.size(); i++) {
-    if (axes[i] >= input.num_dims) {
-      return false;
-    }
-  }
-  return input.is_valid();
-}
-
-ReduceParams Reduce::get_params() const {
-  ReduceParams params;
-  params.axes.clear();
-  for (int i = 0; i < num_axes; i++) {
-    params.axes.push_back(this->axes[i]);
-  }
-  params.keepdims = keepdims;
-  return params;
-}
-
 enum Slots {
   INPUT,
   OUTPUT,
@@ -64,8 +41,8 @@ OpTaskInvocation init(TransposeAttrs const &attrs) {
   binding.bind_arg(HANDLE, ff_handle());
   binding.bind_arg(ATTRS, attrs);
 
-  binding.bind(INPUT, input_parallel_tensor_shape(0));
-  binding.bind(OUTPUT, output_parallel_tensor_shape(0));
+  binding.bind(INPUT, input_tensor(0));
+  binding.bind(OUTPUT, output_tensor(0));
 
   return {REDUCE_INIT_TASK_ID, binding};
 }
@@ -101,6 +78,8 @@ template<> void register_task<TRANSPOSE_INIT_TASK_ID>() {
   init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
   init.add_arg_slot<ReduceAttrs>(ATTRS);
 
+  init.add_return_value<ReducePerDeviceState>();
+
   register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task);
 }
 
@@ -111,7 +90,7 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) {
   bind.bind_arg(PER_DEVICE_STATE, per_device_op_state<ReducePerDeviceState>());
   bind.bind_arg(PROFILING, profiling_tensor());
 
-  binding.bind(INPUT, input_parallel_tensor_shape(0));
+  binding.bind(INPUT, input_tensor(0));
   binding.bind(OUTPUT, output_tensor(0));
 
   return {REDUCE_FWD_TASK_ID, binding};
@@ -209,7 +188,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
 
   SimTaskBinding fwd_binding;
   ParallelTensorShape output_shape = get_output_shape(attrs, input.shape);
-  fwd.bind(INPUT, input);
+  fwd.bind(INPUT, input.shape);
   fwd.bind(OUTPUT, output_shape);
   fwd.bind_arg(PROFILING, settings);
   fwd.bind_arg(PER_DEVICE_STATE, per_device_state);

From dba438d992723090bc8412f08c8555b64946f2ca Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Tue, 10 Oct 2023 20:34:51 +0000
Subject: [PATCH 12/13] format the code

---
 lib/kernels/include/kernels/reduce_kernels.h | 12 ++++++------
 lib/kernels/src/cuda/reduce_kernels.cu       | 16 +++++++++-------
 lib/runtime/src/ops/reduce.cc                |  7 ++++---
 3 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h
index 9db7bd9aea..1220a93203 100644
--- a/lib/kernels/include/kernels/reduce_kernels.h
+++ b/lib/kernels/include/kernels/reduce_kernels.h
@@ -15,12 +15,12 @@ struct ReducePerDeviceState {
 };
 
 FF_VISITABLE_STRUCT(ReducePerDeviceState,
-                          handle,
-                          inputTensor,
-                          outputTensor,
-                          reduceDesc,
-                          op_type,
-                          reduction_size);
+                    handle,
+                    inputTensor,
+                    outputTensor,
+                    reduceDesc,
+                    op_type,
+                    reduction_size);
 
 namespace Kernels {
 namespace Reduce {
diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu
index b7a5c824f2..a67f52999d 100644
--- a/lib/kernels/src/cuda/reduce_kernels.cu
+++ b/lib/kernels/src/cuda/reduce_kernels.cu
@@ -70,22 +70,24 @@ namespace Reduce {
 ReducePerDeviceState init_kernel(PerDeviceFFhandle const &handle,
                                  OperatorType const &op_type,
                                  size_t const &reduction_size,
-                                 ArrayShape const & input_shape,
-                                 ArrayShape const & output_shape) {
-  
+                                 ArrayShape const &input_shape,
+                                 ArrayShape const &output_shape) {
+
   ffTensorDescriptor_t inputTensor;
   ffTensorDescriptor_t outputTensor;
   ffReduceTensorDescriptor_t reduceDesc;
 
   checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
-  checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));;
+  checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
+  ;
   checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc));
 
   checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape));
-  checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape));
-
-  ReducePerDeviceState per_device = {handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size};
+  checkCUDNN(
+      cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape));
 
+  ReducePerDeviceState per_device = {
+      handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size};
 }
 
 void forward_kernel(cudaStream_t stream,
diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index fe7130ca88..6932e645b4 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -56,7 +56,7 @@ static DeviceSpecific<ReducePerDeviceState>
 
   OperatorType = attrs.op_type;
   // Note: How to set the reduction size?
-  size_t reduction_size ;
+  size_t reduction_size;
   DeviceSpecific<ReducePerDeviceState> per_device_state =
       acc.create_device_specific<ReducePerDeviceState>(init_kernel(
           handle, op_type, reduction_size, input.shape, output.shape));
@@ -72,10 +72,11 @@ static DeviceSpecific<TransposePerDeviceState>
   return init_task_impl(acc);
 }
 
-template<> void register_task<TRANSPOSE_INIT_TASK_ID>() {
+template <>
+void register_task<TRANSPOSE_INIT_TASK_ID>() {
   OpTaskSignature init(OpTaskType::INIT)
 
-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+      init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
   init.add_arg_slot<ReduceAttrs>(ATTRS);
 
   init.add_return_value<ReducePerDeviceState>();

From 925356e46e508b5971b23a27d526b448fbc62c31 Mon Sep 17 00:00:00 2001
From: lambda7xx <lambda7xx@gmail.com>
Date: Sun, 29 Oct 2023 00:01:51 +0000
Subject: [PATCH 13/13] add reduction size

---
 lib/runtime/src/ops/reduce.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index 6932e645b4..2674dc4fef 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -56,7 +56,7 @@ static DeviceSpecific<ReducePerDeviceState>
 
   OperatorType = attrs.op_type;
   // Note: How to set the reduction size?
-  size_t reduction_size;
+  size_t reduction_size = input.shape.get_volume() / output.shape.get_volume();
   DeviceSpecific<ReducePerDeviceState> per_device_state =
       acc.create_device_specific<ReducePerDeviceState>(init_kernel(
           handle, op_type, reduction_size, input.shape, output.shape));