From 5ee6acd093b97d6768e8078a3a830725327a783e Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 6 Sep 2023 08:07:04 +0000 Subject: [PATCH 01/16] start to do the reshape operator --- deps/fmt | 2 +- lib/kernels/include/kernels/reshape_kernels.h | 17 +- lib/runtime/src/ops/replicate.h | 4 +- lib/runtime/src/ops/reshape.cc | 850 ++++++++++-------- lib/runtime/src/ops/reshape.h | 4 +- lib/runtime/src/tasks.h | 2 +- 6 files changed, 473 insertions(+), 406 deletions(-) diff --git a/deps/fmt b/deps/fmt index f5e54359df..a33701196a 160000 --- a/deps/fmt +++ b/deps/fmt @@ -1 +1 @@ -Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 +Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50 diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 7cb30254f6..0ce07ae88b 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,27 +1,32 @@ #ifndef _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H +#include "attention_kernels.h" +#include "datatype_dispatch.h" #include "kernels/accessor.h" #include "kernels/device.h" +#include "utils/required_core.h" namespace FlexFlow { -class ReshapePerDeviceState : public PerDeviceOpState { -public: - ReshapePerDeviceState(FFHandler handler); - DataType data_type; +struct ReshapePerDeviceState { + req data_type; }; +FF_VISITABLE_STRUCT_NO_EQ(ReshapePerDeviceState, data_type); + +ReshapePerDeviceState init_kernel(DataType data_type); + namespace Kernels { namespace Reshape { void forward_kernel(ffStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const & meta, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const & meta, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output); diff --git 
a/lib/runtime/src/ops/replicate.h b/lib/runtime/src/ops/replicate.h index fd5ffd9ef9..083998414e 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/runtime/src/ops/replicate.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_REPLICATE_H #include "op-attrs/ops/replicate.h" -#include "op_task_invocation.h" +#include "task_spec/op_task_invocation.h" #include "sim_environment.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReplicateAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReplicateAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 71fb10bc9c..45c69d210d 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -46,418 +46,480 @@ bool ReshapeParams::is_valid(ParallelTensorShape const &input) const { return input.is_valid(); } -Tensor FFModel::reshape(const Tensor input, - std::vector const &shape, - char const *name) { - Layer *reshape = new Layer(this, - OP_RESHAPE, - DT_FLOAT, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input); - int dims[MAX_TENSOR_DIM]; - int numdim = shape.size(); - for (int i = 0; i < numdim; i++) { - assert(shape[i] > 0); - dims[i] = shape[i]; - } - reshape->outputs[0] = create_tensor( - numdim, dims, input->data_type, reshape, 0, true /*create_grad*/); - reshape->add_int_vector_property("shape", shape); - layers.push_back(reshape); - return reshape->outputs[0]; -} +enum slots {INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; -Op *Reshape::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - std::vector shape; - layer->get_int_vector_property("shape", shape); - return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); -} -Reshape::Reshape(FFModel &model, - LayerID const &_layer_guid, - 
const ParallelTensor input, - std::vector const &_shape, - char const *name) - : Op(model, - OP_RESHAPE, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input) { - layer_guid = _layer_guid; - shape_length = _shape.size(); - assert(shape_length <= MAX_TENSOR_DIM); - for (int i = 0; i < shape_length; i++) { - shape_array[i] = _shape[i]; - } - numOutputs = 1; - numWeights = 0; - int num_replica_dims = 0; - for (int i = 0; i < input->num_dims; i++) { - if (input->dims[i].is_replica_dim) { - num_replica_dims++; - } - } - // assert that all replica dims are leading dims - for (int i = 0; i < num_replica_dims; i++) { - assert(input->dims[input->num_dims - 1 - i].is_replica_dim); - } - int numdim = (int)_shape.size(); - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i].size = _shape[numdim - 1 - i]; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = false; - } - // copy all replica dims - for (int i = 0; i < num_replica_dims; i++) { - dims[i + numdim] = input->dims[input->num_dims - 1 - i]; - } - numdim += num_replica_dims; - for (int i = num_replica_dims; i < numdim && i < input->num_dims; i++) { - if (dims[numdim - 1 - i].size != - input->dims[input->num_dims - 1 - i].size) { - break; - } - dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, input->data_type, this); - assert(outputs[0]->get_volume() == inputs[0]->get_volume()); -} +OpTaskInvocation init(ReshapeAttrs const & attrs) { + OpTaskBinding binding; -Reshape::Reshape(FFModel &model, - ReshapeParams const ¶ms, - const ParallelTensor input, - char const *name) - : Reshape(model, params.layer_guid, input, params.shape, name) {} - -void Reshape::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime 
= ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(RESHAPE_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Reshape)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); -PerDeviceOpState *Reshape::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - Reshape const *reshape = (Reshape *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - ReshapeMeta *m = new ReshapeMeta(handle); - m->data_type = reshape->outputs[0]->data_type; - return m; + return {RESHAPE_INIT_TASK_ID, binding}; } -void Reshape::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(RESHAPE_FWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} 
+OpTaskInvocation forward(ReshapeAttrs const & attrs) { + OpTaskBinding binding; -void Reshape::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Reshape* reshape = (const Reshape*) task->args; - ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); - Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(in_domain.get_volume() == out_domain.get_volume()); - - if (m->data_type == DT_FLOAT) { - float const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else if (m->data_type == DT_DOUBLE) { - double const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - double *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else if (m->data_type == DT_INT32) { - int32_t const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int32_t *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else if (m->data_type == DT_INT64) { - int64_t const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int64_t *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else { - assert(false && "Unsupported data type in Reshape forward"); 
- } -} + binding.bind(PER_DEVICE_STATE, per_device_op_state()); + binding.bind(PROFILING, profiling_settings()); -void Reshape::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); - IndexLauncher launcher(RESHAPE_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - // regions[3](I/O): input0_grad - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); + return {RESHAPE_FWD_TASK_ID, binding}; } -ReshapeParams Reshape::get_params() const { - std::vector shape_vec; - for (size_t i = 0; i < shape_length; i++) { - shape_vec.push_back(shape_array[i]); - } - ReshapeParams params; - params.shape = shape_vec; - params.layer_guid = this->layer_guid; - return params; -} +OpTaskInvocation backward(ReshapeAttrs const & attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); -void Reshape::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Reshape* reshape = (const Reshape*) task->args; - ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); - Domain out_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain in_grad_domain = runtime->get_index_space_domain( - ctx, 
task->regions[1].region.get_index_space()); - assert(in_grad_domain.get_volume() == out_grad_domain.get_volume()); - - if (m->data_type == DT_FLOAT) { - float const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else if (m->data_type == DT_DOUBLE) { - double const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - double *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else if (m->data_type == DT_INT32) { - int32_t const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int32_t *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else if (m->data_type == DT_INT64) { - int64_t const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int64_t *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else { - assert(false && "Unsupported data type in Reshape backward"); - } + return {RESHAPE_BWD_TASK_ID, binding}; } -bool Reshape::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_input, sub_output; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - - sim->free_all(); - float *input_ptr = (float 
*)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_ptr != NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - assert(sub_output.get_volume() == sub_input.get_volume()); - size_t num_elements = sub_input.get_volume(); - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(input_ptr, output_ptr, num_elements); - }; - if (sim->computationMode == COMP_MODE_TRAINING) { - float *input_grad_ptr = - (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_grad_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *output_grad_ptr = - (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_grad_ptr != NULL); - cost_metrics.outputs_memory += - cost_metrics.total_mem_diff_from(sim->offset); - - backward = [&] { - backward_kernel_wrapper(input_grad_ptr, output_grad_ptr, num_elements); - }; - } - - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - printf( - "[Measure Reshape] name(%s) forward_time(%.4lf) backward_time(%.4lf)\n", - name, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Reshape] name(%s) forward_time(%.4lf)\n", - name, - cost_metrics.forward_time); - } - return true; -} -void Reshape::serialize(Legion::Serializer &sez) const { - sez.serialize(this->shape_length); - for (size_t i = 0; i < this->shape_length; i++) { - sez.serialize(this->shape_array[i]); - } - sez.serialize(this->layer_guid.id); -} -using PCG::Node; - -Node Reshape::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 1); - size_t shape_length; - std::vector shape; - 
dez.deserialize(shape_length); - for (size_t i = 0; i < shape_length; i++) { - int value; - dez.deserialize(value); - shape.push_back(value); - } - size_t id; - dez.deserialize(id); - LayerID layer_guid(id); - - ReshapeParams params; - params.shape = shape; - params.layer_guid = layer_guid; - return ff.get_or_create_node(inputs[0], params); +template <> +void register_task() { + OpTaskSignature init(OpTaskType::INIT); + + init.add_input_slots(INPUT); + init.add_output_slots(OUTPUT); + + register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); } -Op *Reshape::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - assert(num_inputs == 1); - std::vector shape; - for (size_t i = 0; i < this->shape_length; i++) { - shape.push_back(shape_array[i]); - } - return new Reshape(ff, this->layer_guid, inputs[0], shape, this->name); +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_arg_slot(PROFILING); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + + register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task); } -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::ReshapeParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.shape.size()); - for (int n : params.shape) { - hash_combine(key, n); - } - hash_combine(key, params.layer_guid.id); - return key; +template <> +void register_task() { + OpTaskSignature bwd = infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); + + register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); } + +// Tensor FFModel::reshape(const Tensor input, +// std::vector const &shape, +// char const *name) { +// Layer *reshape = new Layer(this, +// OP_RESHAPE, +// DT_FLOAT, +// name, +// 1 /*inputs*/, +// 0 /*weights*/, +// 1 /*outputs*/, +// input); +// int dims[MAX_TENSOR_DIM]; +// int numdim = shape.size(); +// for (int i = 0; i < numdim; i++) 
{ +// assert(shape[i] > 0); +// dims[i] = shape[i]; +// } +// reshape->outputs[0] = create_tensor( +// numdim, dims, input->data_type, reshape, 0, true /*create_grad*/); +// reshape->add_int_vector_property("shape", shape); +// layers.push_back(reshape); +// return reshape->outputs[0]; +// } + +// Op *Reshape::create_operator_from_layer( +// FFModel &model, +// Layer const *layer, +// std::vector const &inputs) { +// std::vector shape; +// layer->get_int_vector_property("shape", shape); +// return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); +// } + +// Reshape::Reshape(FFModel &model, +// LayerID const &_layer_guid, +// const ParallelTensor input, +// std::vector const &_shape, +// char const *name) +// : Op(model, +// OP_RESHAPE, +// input->data_type, +// name, +// 1 /*inputs*/, +// 0 /*weights*/, +// 1 /*outputs*/, +// input) { +// layer_guid = _layer_guid; +// shape_length = _shape.size(); +// assert(shape_length <= MAX_TENSOR_DIM); +// for (int i = 0; i < shape_length; i++) { +// shape_array[i] = _shape[i]; +// } +// numOutputs = 1; +// numWeights = 0; +// int num_replica_dims = 0; +// for (int i = 0; i < input->num_dims; i++) { +// if (input->dims[i].is_replica_dim) { +// num_replica_dims++; +// } +// } +// // assert that all replica dims are leading dims +// for (int i = 0; i < num_replica_dims; i++) { +// assert(input->dims[input->num_dims - 1 - i].is_replica_dim); +// } +// int numdim = (int)_shape.size(); +// ParallelDim dims[MAX_TENSOR_DIM]; +// for (int i = 0; i < numdim; i++) { +// dims[i].size = _shape[numdim - 1 - i]; +// dims[i].degree = 1; +// dims[i].parallel_idx = -1; +// dims[i].is_replica_dim = false; +// } +// // copy all replica dims +// for (int i = 0; i < num_replica_dims; i++) { +// dims[i + numdim] = input->dims[input->num_dims - 1 - i]; +// } +// numdim += num_replica_dims; +// for (int i = num_replica_dims; i < numdim && i < input->num_dims; i++) { +// if (dims[numdim - 1 - i].size != +// 
input->dims[input->num_dims - 1 - i].size) { +// break; +// } +// dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; +// } +// outputs[0] = model.create_parallel_tensor_legion_ordering( +// numdim, dims, input->data_type, this); +// assert(outputs[0]->get_volume() == inputs[0]->get_volume()); +// } + +// Reshape::Reshape(FFModel &model, +// ReshapeParams const ¶ms, +// const ParallelTensor input, +// char const *name) +// : Reshape(model, params.layer_guid, input, params.shape, name) {} + +// void Reshape::init(FFModel const &ff) { +// assert(check_output_input_weight_same_parallel_is()); +// parallel_is = outputs[0]->parallel_is; +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_init(ff, argmap); +// IndexLauncher launcher(RESHAPE_INIT_TASK_ID, +// parallel_is, +// TaskArgument(this, sizeof(Reshape)), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// inputs[0]->region)); +// launcher.add_field(0, FID_DATA); +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// outputs[0]->region)); +// launcher.add_field(1, FID_DATA); +// FutureMap fm = runtime->execute_index_space(ctx, launcher); +// fm.wait_all_results(); +// set_opmeta_from_futuremap(ff, fm); +// } + +// PerDeviceOpState *Reshape::init_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// Reshape const *reshape = (Reshape *)task->args; +// FFHandler handle = *((FFHandler const *)task->local_args); +// ReshapeMeta *m = new ReshapeMeta(handle); +// m->data_type = reshape->outputs[0]->data_type; +// return m; +// } + +// void Reshape::forward(FFModel const &ff) { +// ArgumentMap argmap; +// Context ctx = 
ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_forward(ff, argmap); +// IndexLauncher launcher(RESHAPE_FWD_TASK_ID, +// parallel_is, +// TaskArgument(NULL, 0), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// inputs[0]->region)); +// launcher.add_field(0, FID_DATA); +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// outputs[0]->region)); +// launcher.add_field(1, FID_DATA); +// runtime->execute_index_space(ctx, launcher); +// } + +// void Reshape::forward_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// assert(regions.size() == 2); +// assert(task->regions.size() == 2); +// // const Reshape* reshape = (const Reshape*) task->args; +// ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); +// Domain in_domain = runtime->get_index_space_domain( +// ctx, task->regions[0].region.get_index_space()); +// Domain out_domain = runtime->get_index_space_domain( +// ctx, task->regions[1].region.get_index_space()); +// assert(in_domain.get_volume() == out_domain.get_volume()); + +// if (m->data_type == DT_FLOAT) { +// float const *in_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// float *out_ptr = helperGetTensorPointerWO( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); +// } else if (m->data_type == DT_DOUBLE) { +// double const *in_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// double *out_ptr = helperGetTensorPointerWO( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// forward_kernel_wrapper(in_ptr, out_ptr, 
in_domain.get_volume()); +// } else if (m->data_type == DT_INT32) { +// int32_t const *in_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// int32_t *out_ptr = helperGetTensorPointerWO( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); +// } else if (m->data_type == DT_INT64) { +// int64_t const *in_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// int64_t *out_ptr = helperGetTensorPointerWO( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); +// } else { +// assert(false && "Unsupported data type in Reshape forward"); +// } +// } + +// void Reshape::backward(FFModel const &ff) { +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_backward(ff, argmap); +// IndexLauncher launcher(RESHAPE_BWD_TASK_ID, +// parallel_is, +// TaskArgument(NULL, 0), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// // regions[0](I): output_grad +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// outputs[0]->region_grad)); +// launcher.add_field(0, FID_DATA); +// // regions[3](I/O): input0_grad +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, +// 0 /*projection id*/, +// READ_WRITE, +// EXCLUSIVE, +// inputs[0]->region_grad)); +// launcher.add_field(1, FID_DATA); +// runtime->execute_index_space(ctx, launcher); +// } + +// ReshapeParams Reshape::get_params() const { +// std::vector shape_vec; +// for (size_t i = 0; i < shape_length; i++) { +// shape_vec.push_back(shape_array[i]); +// } +// ReshapeParams params; +// params.shape = shape_vec; +// params.layer_guid = this->layer_guid; +// return params; 
+// } + +// void Reshape::backward_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// assert(regions.size() == 2); +// assert(task->regions.size() == 2); +// // const Reshape* reshape = (const Reshape*) task->args; +// ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); +// Domain out_grad_domain = runtime->get_index_space_domain( +// ctx, task->regions[0].region.get_index_space()); +// Domain in_grad_domain = runtime->get_index_space_domain( +// ctx, task->regions[1].region.get_index_space()); +// assert(in_grad_domain.get_volume() == out_grad_domain.get_volume()); + +// if (m->data_type == DT_FLOAT) { +// float const *out_grad_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// float *in_grad_ptr = helperGetTensorPointerRW( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// backward_kernel_wrapper( +// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); +// } else if (m->data_type == DT_DOUBLE) { +// double const *out_grad_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// double *in_grad_ptr = helperGetTensorPointerRW( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// backward_kernel_wrapper( +// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); +// } else if (m->data_type == DT_INT32) { +// int32_t const *out_grad_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// int32_t *in_grad_ptr = helperGetTensorPointerRW( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// backward_kernel_wrapper( +// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); +// } else if (m->data_type == DT_INT64) { +// int64_t const *out_grad_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// int64_t *in_grad_ptr = helperGetTensorPointerRW( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// 
backward_kernel_wrapper( +// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); +// } else { +// assert(false && "Unsupported data type in Reshape backward"); +// } +// } + +// bool Reshape::measure_operator_cost(Simulator *sim, +// MachineView const &mv, +// CostMetrics &cost_metrics) const { +// ParallelTensorBase sub_input, sub_output; +// if (!outputs[0]->get_sub_tensor(mv, sub_output)) { +// return false; +// } +// if (!inputs[0]->get_sub_tensor(mv, sub_input)) { +// return false; +// } + +// sim->free_all(); +// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); +// assert(input_ptr != NULL); +// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + +// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); +// assert(output_ptr != NULL); +// cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + +// assert(sub_output.get_volume() == sub_input.get_volume()); +// size_t num_elements = sub_input.get_volume(); + +// std::function forward, backward; +// forward = [&] { +// forward_kernel_wrapper(input_ptr, output_ptr, num_elements); +// }; +// if (sim->computationMode == COMP_MODE_TRAINING) { +// float *input_grad_ptr = +// (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); +// assert(input_grad_ptr != NULL); +// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + +// float *output_grad_ptr = +// (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); +// assert(output_grad_ptr != NULL); +// cost_metrics.outputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); + +// backward = [&] { +// backward_kernel_wrapper(input_grad_ptr, output_grad_ptr, num_elements); +// }; +// } + +// inner_measure_operator_cost(sim, forward, backward, cost_metrics); + +// if (sim->computationMode == COMP_MODE_TRAINING) { +// printf( +// "[Measure Reshape] name(%s) forward_time(%.4lf) backward_time(%.4lf)\n", +// name, +// 
cost_metrics.forward_time, +// cost_metrics.backward_time); +// } else { +// printf("[Measure Reshape] name(%s) forward_time(%.4lf)\n", +// name, +// cost_metrics.forward_time); +// } +// return true; +// } + +// void Reshape::serialize(Legion::Serializer &sez) const { +// sez.serialize(this->shape_length); +// for (size_t i = 0; i < this->shape_length; i++) { +// sez.serialize(this->shape_array[i]); +// } +// sez.serialize(this->layer_guid.id); +// } + +// using PCG::Node; + +// Node Reshape::deserialize(FFModel &ff, +// Legion::Deserializer &dez, +// ParallelTensor inputs[], +// int num_inputs) { +// assert(num_inputs == 1); +// size_t shape_length; +// std::vector shape; +// dez.deserialize(shape_length); +// for (size_t i = 0; i < shape_length; i++) { +// int value; +// dez.deserialize(value); +// shape.push_back(value); +// } +// size_t id; +// dez.deserialize(id); +// LayerID layer_guid(id); + +// ReshapeParams params; +// params.shape = shape; +// params.layer_guid = layer_guid; +// return ff.get_or_create_node(inputs[0], params); +// } + +// Op *Reshape::materialize(FFModel &ff, +// ParallelTensor inputs[], +// int num_inputs) const { +// assert(num_inputs == 1); +// std::vector shape; +// for (size_t i = 0; i < this->shape_length; i++) { +// shape.push_back(shape_array[i]); +// } +// return new Reshape(ff, this->layer_guid, inputs[0], shape, this->name); +// } + +// }; // namespace FlexFlow + +// namespace std { +// size_t hash::operator()( +// FlexFlow::ReshapeParams const ¶ms) const { +// size_t key = 0; +// hash_combine(key, params.shape.size()); +// for (int n : params.shape) { +// hash_combine(key, n); +// } +// hash_combine(key, params.layer_guid.id); +// return key; +// } + }; // namespace std diff --git a/lib/runtime/src/ops/reshape.h b/lib/runtime/src/ops/reshape.h index 42bbefd9db..85370d9ede 100644 --- a/lib/runtime/src/ops/reshape.h +++ b/lib/runtime/src/ops/reshape.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_RESHAPE_H #include "op-attrs/ops/reshape.h" 
-#include "op_task_invocation.h" +#include "task_spec/op_task_invocation.h" #include "sim_environment.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReshapeAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReshapeAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/tasks.h b/lib/runtime/src/tasks.h index 5c1dcb0698..5d36502f6f 100644 --- a/lib/runtime/src/tasks.h +++ b/lib/runtime/src/tasks.h @@ -73,7 +73,7 @@ enum task_id_t { REDUCE_INIT_TASK_ID, REDUCE_FWD_TASK_ID, REDUCE_BWD_TASK_ID, - RESHAPE_INIT_TASK_ID, + _INIT_TASK_IDRESHAPE, RESHAPE_FWD_TASK_ID, RESHAPE_BWD_TASK_ID, REVERSE_INIT_TASK_ID, From 0eaee690703ec2fcb0fa3526ec080abdf4c5ebd2 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 6 Sep 2023 08:17:12 +0000 Subject: [PATCH 02/16] add init_task,forward_task, forward_task for reshape --- lib/runtime/src/ops/reshape.cc | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 45c69d210d..82b8ae2adc 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -16,6 +16,7 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" #include "legion/legion_utilities.h" +#include "utils/exception.decl.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -75,7 +76,51 @@ OpTaskInvocation backward(ReshapeAttrs const & attrs) { return {RESHAPE_BWD_TASK_ID, binding}; } +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { + NOT_IMPLEMENTED(); +} + +static DeviceSpecific init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + return init_task_impl(acc); +} + +static optional forward_task_impl(TaskArgumentAccessor const &acc) { + 
NOT_IMPLEMENTED(); +} + +static void forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); +} + +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + NOT_IMPLEMENTED(); +} + +static void backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); +} + + +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + ReshapeAttrs const &attrs, + InputParallelTensorDesc const &input, + ProfilingSettings const &settings, + MachineView const &machine_view) { + NOT_IMPLEMENTED(); + } template <> void register_task() { From 15ed89cccb5d0ad9aea4bc87eff5a14c6b1d4ef1 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 6 Sep 2023 08:50:48 +0000 Subject: [PATCH 03/16] reshape version0.1 done --- lib/kernels/include/kernels/reshape_kernels.h | 6 +- lib/runtime/src/ops/replicate.h | 2 +- lib/runtime/src/ops/reshape.cc | 96 ++++++++++++------- lib/runtime/src/ops/reshape.h | 2 +- 4 files changed, 69 insertions(+), 37 deletions(-) diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 0ce07ae88b..ad3451e7ef 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -9,7 +9,7 @@ namespace FlexFlow { -struct ReshapePerDeviceState { +struct ReshapePerDeviceState { req data_type; }; @@ -21,12 +21,12 @@ namespace Kernels { namespace Reshape { void forward_kernel(ffStream_t stream, - ReshapePerDeviceState const & meta, + ReshapePerDeviceState const &meta, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const & meta, + ReshapePerDeviceState const &meta, GenericTensorAccessorW const &input, GenericTensorAccessorR const 
&output); diff --git a/lib/runtime/src/ops/replicate.h b/lib/runtime/src/ops/replicate.h index 083998414e..9880f0991b 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/runtime/src/ops/replicate.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_REPLICATE_H #include "op-attrs/ops/replicate.h" -#include "task_spec/op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 82b8ae2adc..bdb4cbaf8b 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -47,10 +47,9 @@ bool ReshapeParams::is_valid(ParallelTensorShape const &input) const { return input.is_valid(); } -enum slots {INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; +enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; - -OpTaskInvocation init(ReshapeAttrs const & attrs) { +OpTaskInvocation init(ReshapeAttrs const &attrs) { OpTaskBinding binding; binding.bind(INPUT, input_tensor(0)); @@ -59,7 +58,7 @@ OpTaskInvocation init(ReshapeAttrs const & attrs) { return {RESHAPE_INIT_TASK_ID, binding}; } -OpTaskInvocation forward(ReshapeAttrs const & attrs) { +OpTaskInvocation forward(ReshapeAttrs const &attrs) { OpTaskBinding binding; binding.bind(PER_DEVICE_STATE, per_device_op_state()); @@ -70,17 +69,19 @@ OpTaskInvocation forward(ReshapeAttrs const & attrs) { return {RESHAPE_FWD_TASK_ID, binding}; } -OpTaskInvocation backward(ReshapeAttrs const & attrs) { +OpTaskInvocation backward(ReshapeAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); return {RESHAPE_BWD_TASK_ID, binding}; } -static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { NOT_IMPLEMENTED(); } -static DeviceSpecific init_task(Task const *task, +static DeviceSpecific + init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { @@ 
-89,7 +90,19 @@ static DeviceSpecific init_task(Task const *task, } static optional forward_task_impl(TaskArgumentAccessor const &acc) { - NOT_IMPLEMENTED(); + auto per_device_state = + acc.get_argument>(PER_DEVICE_STATE); + Profiling profiling = acc.get_argument(PROFILING); + + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + + return profile(forward_kernel, + profiling, + "[Reshape] forward time = %.2lfms\n", + &per_device_state, + input, + output); } static void forward_task(Task const *task, @@ -101,7 +114,19 @@ static void forward_task(Task const *task, } static optional backward_task_impl(TaskArgumentAccessor const &acc) { - NOT_IMPLEMENTED(); + auto per_device_state = + acc.get_argument>(PER_DEVICE_STATE); + Profiling profiling = acc.get_argument(PROFILING); + + auto input_grad = acc.get_tensor(INPUT); + auto output_grad = acc.get_tensor(OUTPUT); + + return profile(backward_kernel, + profiling, + "[Reshape] backward time = %.2lfms\n", + &per_device_state, + input_grad, + output_grad); } static void backward_task(Task const *task, @@ -112,24 +137,29 @@ static void backward_task(Task const *task, backward_task_impl(acc); } - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReshapeAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - NOT_IMPLEMENTED(); - } + // reshape has no cost + // Note(lamda):if reshape has cost, we can optimize this implementation + + float forward_time = 0.0; + float backward_time = 0.0; + float sync_time = 0.0; + return make_metrics(forward_time, backward_time, sync_time, env); +} template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init(OpTaskType::INIT); - init.add_input_slots(INPUT); - init.add_output_slots(OUTPUT); + init.add_input_slots(INPUT); + init.add_output_slots(OUTPUT); - register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); + 
register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); } template <> @@ -147,7 +177,8 @@ void register_task() { template <> void register_task() { - OpTaskSignature bwd = infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); + OpTaskSignature bwd = + infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); } @@ -182,7 +213,8 @@ void register_task() { // std::vector const &inputs) { // std::vector shape; // layer->get_int_vector_property("shape", shape); -// return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); +// return new Reshape(model, layer->layer_guid, inputs[0], shape, +// layer->name); // } // Reshape::Reshape(FFModel &model, @@ -280,9 +312,9 @@ void register_task() { // } // PerDeviceOpState *Reshape::init_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { +// std::vector const +// ®ions, Context ctx, Runtime *runtime) +// { // Reshape const *reshape = (Reshape *)task->args; // FFHandler handle = *((FFHandler const *)task->local_args); // ReshapeMeta *m = new ReshapeMeta(handle); @@ -461,13 +493,13 @@ void register_task() { // } // sim->free_all(); -// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); -// assert(input_ptr != NULL); -// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), +// DT_FLOAT); assert(input_ptr != NULL); cost_metrics.inputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); -// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); -// assert(output_ptr != NULL); -// cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), +// DT_FLOAT); assert(output_ptr != NULL); cost_metrics.outputs_memory += +// 
cost_metrics.total_mem_diff_from(sim->offset); // assert(sub_output.get_volume() == sub_input.get_volume()); // size_t num_elements = sub_input.get_volume(); @@ -480,7 +512,8 @@ void register_task() { // float *input_grad_ptr = // (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); // assert(input_grad_ptr != NULL); -// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// cost_metrics.inputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); // float *output_grad_ptr = // (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); @@ -497,9 +530,8 @@ void register_task() { // if (sim->computationMode == COMP_MODE_TRAINING) { // printf( -// "[Measure Reshape] name(%s) forward_time(%.4lf) backward_time(%.4lf)\n", -// name, -// cost_metrics.forward_time, +// "[Measure Reshape] name(%s) forward_time(%.4lf) +// backward_time(%.4lf)\n", name, cost_metrics.forward_time, // cost_metrics.backward_time); // } else { // printf("[Measure Reshape] name(%s) forward_time(%.4lf)\n", @@ -567,4 +599,4 @@ void register_task() { // return key; // } -}; // namespace std +}; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reshape.h b/lib/runtime/src/ops/reshape.h index 85370d9ede..f044e3f057 100644 --- a/lib/runtime/src/ops/reshape.h +++ b/lib/runtime/src/ops/reshape.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RESHAPE_H #include "op-attrs/ops/reshape.h" -#include "task_spec/op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { From 984d6168c9b3a3e9b7a8ee8ade64f2a09cb9790d Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 15:11:02 +0000 Subject: [PATCH 04/16] update the reshape and leave the init_task and measure_operator_task --- lib/kernels/include/kernels/reshape_kernels.h | 11 +- lib/runtime/src/ops/replicate.h | 2 +- lib/runtime/src/ops/reshape.cc | 438 +----------------- lib/runtime/src/tasks.h | 2 +- 4 files changed, 18 insertions(+), 435 deletions(-) diff --git 
a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index ad3451e7ef..3e6fd46a51 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,8 +1,6 @@ #ifndef _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H -#include "attention_kernels.h" -#include "datatype_dispatch.h" #include "kernels/accessor.h" #include "kernels/device.h" #include "utils/required_core.h" @@ -13,20 +11,21 @@ struct ReshapePerDeviceState { req data_type; }; -FF_VISITABLE_STRUCT_NO_EQ(ReshapePerDeviceState, data_type); +FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -ReshapePerDeviceState init_kernel(DataType data_type); namespace Kernels { namespace Reshape { +ReshapePerDeviceState init_kernel(DataType data_type); + void forward_kernel(ffStream_t stream, - ReshapePerDeviceState const &meta, + ReshapePerDeviceState const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const &meta, + ReshapePerDeviceState const $per_device_state, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output); diff --git a/lib/runtime/src/ops/replicate.h b/lib/runtime/src/ops/replicate.h index 9880f0991b..361e107b1b 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/runtime/src/ops/replicate.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_REPLICATE_H #include "op-attrs/ops/replicate.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index bdb4cbaf8b..f8a8f69de9 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -16,7 +16,7 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" #include "legion/legion_utilities.h" -#include "utils/exception.decl.h" +#include 
"utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -52,7 +52,7 @@ enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; OpTaskInvocation init(ReshapeAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); + binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); return {RESHAPE_INIT_TASK_ID, binding}; @@ -61,10 +61,10 @@ OpTaskInvocation init(ReshapeAttrs const &attrs) { OpTaskInvocation forward(ReshapeAttrs const &attrs) { OpTaskBinding binding; - binding.bind(PER_DEVICE_STATE, per_device_op_state()); - binding.bind(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); + binding.bind_arg(PROFILING, profiling_settings()); - binding.bind(INPUT, input_tensor(0)); + binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); return {RESHAPE_FWD_TASK_ID, binding}; } @@ -91,7 +91,7 @@ static DeviceSpecific static optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = - acc.get_argument>(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -100,7 +100,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, "[Reshape] forward time = %.2lfms\n", - &per_device_state, + per_device_state, input, output); } @@ -124,7 +124,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, "[Reshape] backward time = %.2lfms\n", - &per_device_state, + per_device_state, input_grad, output_grad); } @@ -156,8 +156,8 @@ template <> void register_task() { OpTaskSignature init(OpTaskType::INIT); - init.add_input_slots(INPUT); - init.add_output_slots(OUTPUT); + init.add_input_slot(INPUT); + init.add_output_slot(OUTPUT); register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, 
init_task); } @@ -166,7 +166,7 @@ template <> void register_task() { OpTaskSignature fwd(OpTaskType::FWD); - fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_input_slot(INPUT); @@ -183,420 +183,4 @@ void register_task() { register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); } -// Tensor FFModel::reshape(const Tensor input, -// std::vector const &shape, -// char const *name) { -// Layer *reshape = new Layer(this, -// OP_RESHAPE, -// DT_FLOAT, -// name, -// 1 /*inputs*/, -// 0 /*weights*/, -// 1 /*outputs*/, -// input); -// int dims[MAX_TENSOR_DIM]; -// int numdim = shape.size(); -// for (int i = 0; i < numdim; i++) { -// assert(shape[i] > 0); -// dims[i] = shape[i]; -// } -// reshape->outputs[0] = create_tensor( -// numdim, dims, input->data_type, reshape, 0, true /*create_grad*/); -// reshape->add_int_vector_property("shape", shape); -// layers.push_back(reshape); -// return reshape->outputs[0]; -// } - -// Op *Reshape::create_operator_from_layer( -// FFModel &model, -// Layer const *layer, -// std::vector const &inputs) { -// std::vector shape; -// layer->get_int_vector_property("shape", shape); -// return new Reshape(model, layer->layer_guid, inputs[0], shape, -// layer->name); -// } - -// Reshape::Reshape(FFModel &model, -// LayerID const &_layer_guid, -// const ParallelTensor input, -// std::vector const &_shape, -// char const *name) -// : Op(model, -// OP_RESHAPE, -// input->data_type, -// name, -// 1 /*inputs*/, -// 0 /*weights*/, -// 1 /*outputs*/, -// input) { -// layer_guid = _layer_guid; -// shape_length = _shape.size(); -// assert(shape_length <= MAX_TENSOR_DIM); -// for (int i = 0; i < shape_length; i++) { -// shape_array[i] = _shape[i]; -// } -// numOutputs = 1; -// numWeights = 0; -// int num_replica_dims = 0; -// for (int i = 0; i < input->num_dims; i++) { -// if (input->dims[i].is_replica_dim) { -// num_replica_dims++; -// } -// } -// // assert that all replica dims 
are leading dims -// for (int i = 0; i < num_replica_dims; i++) { -// assert(input->dims[input->num_dims - 1 - i].is_replica_dim); -// } -// int numdim = (int)_shape.size(); -// ParallelDim dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdim; i++) { -// dims[i].size = _shape[numdim - 1 - i]; -// dims[i].degree = 1; -// dims[i].parallel_idx = -1; -// dims[i].is_replica_dim = false; -// } -// // copy all replica dims -// for (int i = 0; i < num_replica_dims; i++) { -// dims[i + numdim] = input->dims[input->num_dims - 1 - i]; -// } -// numdim += num_replica_dims; -// for (int i = num_replica_dims; i < numdim && i < input->num_dims; i++) { -// if (dims[numdim - 1 - i].size != -// input->dims[input->num_dims - 1 - i].size) { -// break; -// } -// dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; -// } -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// numdim, dims, input->data_type, this); -// assert(outputs[0]->get_volume() == inputs[0]->get_volume()); -// } - -// Reshape::Reshape(FFModel &model, -// ReshapeParams const ¶ms, -// const ParallelTensor input, -// char const *name) -// : Reshape(model, params.layer_guid, input, params.shape, name) {} - -// void Reshape::init(FFModel const &ff) { -// assert(check_output_input_weight_same_parallel_is()); -// parallel_is = outputs[0]->parallel_is; -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_init(ff, argmap); -// IndexLauncher launcher(RESHAPE_INIT_TASK_ID, -// parallel_is, -// TaskArgument(this, sizeof(Reshape)), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// 
WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(1, FID_DATA); -// FutureMap fm = runtime->execute_index_space(ctx, launcher); -// fm.wait_all_results(); -// set_opmeta_from_futuremap(ff, fm); -// } - -// PerDeviceOpState *Reshape::init_task(Task const *task, -// std::vector const -// ®ions, Context ctx, Runtime *runtime) -// { -// Reshape const *reshape = (Reshape *)task->args; -// FFHandler handle = *((FFHandler const *)task->local_args); -// ReshapeMeta *m = new ReshapeMeta(handle); -// m->data_type = reshape->outputs[0]->data_type; -// return m; -// } - -// void Reshape::forward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_forward(ff, argmap); -// IndexLauncher launcher(RESHAPE_FWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(1, FID_DATA); -// runtime->execute_index_space(ctx, launcher); -// } - -// void Reshape::forward_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { -// assert(regions.size() == 2); -// assert(task->regions.size() == 2); -// // const Reshape* reshape = (const Reshape*) task->args; -// ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); -// Domain in_domain = runtime->get_index_space_domain( -// ctx, task->regions[0].region.get_index_space()); -// Domain out_domain = runtime->get_index_space_domain( -// ctx, task->regions[1].region.get_index_space()); -// assert(in_domain.get_volume() 
== out_domain.get_volume()); - -// if (m->data_type == DT_FLOAT) { -// float const *in_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// float *out_ptr = helperGetTensorPointerWO( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); -// } else if (m->data_type == DT_DOUBLE) { -// double const *in_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// double *out_ptr = helperGetTensorPointerWO( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); -// } else if (m->data_type == DT_INT32) { -// int32_t const *in_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// int32_t *out_ptr = helperGetTensorPointerWO( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); -// } else if (m->data_type == DT_INT64) { -// int64_t const *in_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// int64_t *out_ptr = helperGetTensorPointerWO( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); -// } else { -// assert(false && "Unsupported data type in Reshape forward"); -// } -// } - -// void Reshape::backward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_backward(ff, argmap); -// IndexLauncher launcher(RESHAPE_BWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// // regions[0](I): output_grad -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, -// 0 /*projection id*/, -// READ_ONLY, 
-// EXCLUSIVE, -// outputs[0]->region_grad)); -// launcher.add_field(0, FID_DATA); -// // regions[3](I/O): input0_grad -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[0]->region_grad)); -// launcher.add_field(1, FID_DATA); -// runtime->execute_index_space(ctx, launcher); -// } - -// ReshapeParams Reshape::get_params() const { -// std::vector shape_vec; -// for (size_t i = 0; i < shape_length; i++) { -// shape_vec.push_back(shape_array[i]); -// } -// ReshapeParams params; -// params.shape = shape_vec; -// params.layer_guid = this->layer_guid; -// return params; -// } - -// void Reshape::backward_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { -// assert(regions.size() == 2); -// assert(task->regions.size() == 2); -// // const Reshape* reshape = (const Reshape*) task->args; -// ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); -// Domain out_grad_domain = runtime->get_index_space_domain( -// ctx, task->regions[0].region.get_index_space()); -// Domain in_grad_domain = runtime->get_index_space_domain( -// ctx, task->regions[1].region.get_index_space()); -// assert(in_grad_domain.get_volume() == out_grad_domain.get_volume()); - -// if (m->data_type == DT_FLOAT) { -// float const *out_grad_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// float *in_grad_ptr = helperGetTensorPointerRW( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// backward_kernel_wrapper( -// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); -// } else if (m->data_type == DT_DOUBLE) { -// double const *out_grad_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// double *in_grad_ptr = helperGetTensorPointerRW( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// backward_kernel_wrapper( -// in_grad_ptr, out_grad_ptr, 
in_grad_domain.get_volume()); -// } else if (m->data_type == DT_INT32) { -// int32_t const *out_grad_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// int32_t *in_grad_ptr = helperGetTensorPointerRW( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// backward_kernel_wrapper( -// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); -// } else if (m->data_type == DT_INT64) { -// int64_t const *out_grad_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// int64_t *in_grad_ptr = helperGetTensorPointerRW( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// backward_kernel_wrapper( -// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); -// } else { -// assert(false && "Unsupported data type in Reshape backward"); -// } -// } - -// bool Reshape::measure_operator_cost(Simulator *sim, -// MachineView const &mv, -// CostMetrics &cost_metrics) const { -// ParallelTensorBase sub_input, sub_output; -// if (!outputs[0]->get_sub_tensor(mv, sub_output)) { -// return false; -// } -// if (!inputs[0]->get_sub_tensor(mv, sub_input)) { -// return false; -// } - -// sim->free_all(); -// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), -// DT_FLOAT); assert(input_ptr != NULL); cost_metrics.inputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); - -// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), -// DT_FLOAT); assert(output_ptr != NULL); cost_metrics.outputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); - -// assert(sub_output.get_volume() == sub_input.get_volume()); -// size_t num_elements = sub_input.get_volume(); - -// std::function forward, backward; -// forward = [&] { -// forward_kernel_wrapper(input_ptr, output_ptr, num_elements); -// }; -// if (sim->computationMode == COMP_MODE_TRAINING) { -// float *input_grad_ptr = -// (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); -// 
assert(input_grad_ptr != NULL); -// cost_metrics.inputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); - -// float *output_grad_ptr = -// (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); -// assert(output_grad_ptr != NULL); -// cost_metrics.outputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); - -// backward = [&] { -// backward_kernel_wrapper(input_grad_ptr, output_grad_ptr, num_elements); -// }; -// } - -// inner_measure_operator_cost(sim, forward, backward, cost_metrics); - -// if (sim->computationMode == COMP_MODE_TRAINING) { -// printf( -// "[Measure Reshape] name(%s) forward_time(%.4lf) -// backward_time(%.4lf)\n", name, cost_metrics.forward_time, -// cost_metrics.backward_time); -// } else { -// printf("[Measure Reshape] name(%s) forward_time(%.4lf)\n", -// name, -// cost_metrics.forward_time); -// } -// return true; -// } - -// void Reshape::serialize(Legion::Serializer &sez) const { -// sez.serialize(this->shape_length); -// for (size_t i = 0; i < this->shape_length; i++) { -// sez.serialize(this->shape_array[i]); -// } -// sez.serialize(this->layer_guid.id); -// } - -// using PCG::Node; - -// Node Reshape::deserialize(FFModel &ff, -// Legion::Deserializer &dez, -// ParallelTensor inputs[], -// int num_inputs) { -// assert(num_inputs == 1); -// size_t shape_length; -// std::vector shape; -// dez.deserialize(shape_length); -// for (size_t i = 0; i < shape_length; i++) { -// int value; -// dez.deserialize(value); -// shape.push_back(value); -// } -// size_t id; -// dez.deserialize(id); -// LayerID layer_guid(id); - -// ReshapeParams params; -// params.shape = shape; -// params.layer_guid = layer_guid; -// return ff.get_or_create_node(inputs[0], params); -// } - -// Op *Reshape::materialize(FFModel &ff, -// ParallelTensor inputs[], -// int num_inputs) const { -// assert(num_inputs == 1); -// std::vector shape; -// for (size_t i = 0; i < this->shape_length; i++) { -// shape.push_back(shape_array[i]); -// } -// return new 
Reshape(ff, this->layer_guid, inputs[0], shape, this->name); -// } - -// }; // namespace FlexFlow - -// namespace std { -// size_t hash::operator()( -// FlexFlow::ReshapeParams const ¶ms) const { -// size_t key = 0; -// hash_combine(key, params.shape.size()); -// for (int n : params.shape) { -// hash_combine(key, n); -// } -// hash_combine(key, params.layer_guid.id); -// return key; -// } - }; // namespace FlexFlow diff --git a/lib/runtime/src/tasks.h b/lib/runtime/src/tasks.h index 5d36502f6f..5c1dcb0698 100644 --- a/lib/runtime/src/tasks.h +++ b/lib/runtime/src/tasks.h @@ -73,7 +73,7 @@ enum task_id_t { REDUCE_INIT_TASK_ID, REDUCE_FWD_TASK_ID, REDUCE_BWD_TASK_ID, - _INIT_TASK_IDRESHAPE, + RESHAPE_INIT_TASK_ID, RESHAPE_FWD_TASK_ID, RESHAPE_BWD_TASK_ID, REVERSE_INIT_TASK_ID, From 8b0a0270dfd9e7b0404c6e5c022a74d17efdac1c Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 19:25:52 +0000 Subject: [PATCH 05/16] refine the reshape pr --- lib/kernels/include/kernels/reshape_kernels.h | 1 - lib/runtime/src/ops/reshape.cc | 38 +++++++++++++++---- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 3e6fd46a51..57296fb4aa 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -13,7 +13,6 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); - namespace Kernels { namespace Reshape { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index f8a8f69de9..d2b6028f57 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -54,6 +54,7 @@ OpTaskInvocation init(ReshapeAttrs const &attrs) { binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); return {RESHAPE_INIT_TASK_ID, binding}; } @@ -61,7 +62,8 @@ OpTaskInvocation init(ReshapeAttrs const 
&attrs) { OpTaskInvocation forward(ReshapeAttrs const &attrs) { OpTaskBinding binding; - binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); binding.bind(INPUT, input_parallel_tensor_shape(0)); @@ -77,7 +79,12 @@ OpTaskInvocation backward(ReshapeAttrs const &attrs) { static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { - NOT_IMPLEMENTED(); + auto attrs = acc.get_argument(ATTRS); + + DeviceSpecific per_device_state = + acc.create_device_specific( + init_kernel(attrs.shape.data_type)); + return per_device_state; } static DeviceSpecific @@ -91,7 +98,7 @@ static DeviceSpecific static optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -143,12 +150,29 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ProfilingSettings const &settings, MachineView const &machine_view) { - // reshape has no cost // Note(lamda):if reshape has cost, we can optimize this implementation - float forward_time = 0.0; - float backward_time = 0.0; - float sync_time = 0.0; + SimTaskBinding init_binding; + init_binding.bind_arg(ATTRS, attrs); + auto init_accessor = + env.get_init_accessor(RESHAPE_INIT_TASK_ID, init_binding); + auto per_device_state = init_task_impl(init_accessor); + + SimTaskBinding fwd_binding; + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind(INPUT, input_parallel_tensor_shape(0)); + fwd_binding.bind(OUTPUT, output_tensor(0)); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(TOPK_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(TOPK_BWD_TASK_ID, bwd_binding); + + float 
forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); return make_metrics(forward_time, backward_time, sync_time, env); } From 3047c75c384f00a1bc9e7069789cafb496701be8 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 19:27:30 +0000 Subject: [PATCH 06/16] remove the replicate --- lib/runtime/src/ops/replicate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/replicate.h b/lib/runtime/src/ops/replicate.h index 361e107b1b..fd5ffd9ef9 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/runtime/src/ops/replicate.h @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReplicateAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReplicateAttrs const &attrs, - InputParallelTensorDesc const &input, + ParallelTensorShape const &input_shape, ProfilingSettings const &settings, MachineView const &machine_view); From ea161cd63f41761e262ba1a294b7a58b1e2a53dc Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 21:10:41 +0000 Subject: [PATCH 07/16] fix error in reshape --- lib/runtime/src/ops/reshape.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index d2b6028f57..1c0e4477ef 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -81,8 +81,8 @@ static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); - DeviceSpecific per_device_state = - acc.create_device_specific( + DeviceSpecific per_device_state = + acc.create_device_specific( init_kernel(attrs.shape.data_type)); return per_device_state; } @@ -122,7 +122,7 @@ static void forward_task(Task const *task, static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = - acc.get_argument>(PER_DEVICE_STATE); + 
acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor(INPUT); From 63cc83dd409ef929010c2161dacf29071c5d72d7 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 26 Sep 2023 13:46:17 +0000 Subject: [PATCH 08/16] fix the type error --- lib/kernels/include/kernels/reshape_kernels.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 57296fb4aa..6be4012073 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -24,7 +24,7 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const $per_device_state, + ReshapePerDeviceState const &per_device_state, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output); From 59166071d31707e62664af61d6bca1a588c168f3 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 27 Sep 2023 20:04:17 +0000 Subject: [PATCH 09/16] add init_kernel for reshape --- lib/kernels/src/cuda/reshape_kernels.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index 15ae0c7109..554ab9fa6b 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -19,8 +19,9 @@ namespace FlexFlow { -ReshapePerDeviceState::ReshapePerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) {} +ReshapePerDeviceState init_kernel(DataType data_type) { + return ReshapePerDeviceState{data_type}; +} namespace Kernels { namespace Reshape { From 171f05fbf037f272d06678b773a4694bc0062c8d Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 3 Oct 2023 15:08:46 +0000 Subject: [PATCH 10/16] modify the reshape by fixing the typo error --- lib/runtime/src/ops/reshape.cc | 17 ++---------------- 1 file
changed, 2 insertions(+), 15 deletions(-) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 1c0e4477ef..3e57da3c97 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -38,22 +38,11 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reshape; -/* Params */ -bool operator==(ReshapeParams const &lhs, ReshapeParams const &rhs) { - return lhs.shape == rhs.shape; -} - -bool ReshapeParams::is_valid(ParallelTensorShape const &input) const { - return input.is_valid(); -} - enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; OpTaskInvocation init(ReshapeAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_parallel_tensor_shape(0)); - binding.bind(OUTPUT, output_tensor(0)); binding.bind_arg(ATTRS, attrs); return {RESHAPE_INIT_TASK_ID, binding}; @@ -150,8 +139,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ProfilingSettings const &settings, MachineView const &machine_view) { - // Note(lamda):if reshape has cost, we can optimize this implementation - SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); auto init_accessor = @@ -166,8 +153,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_fwd_accessor(TOPK_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = env.get_bwd_accessor(TOPK_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = env.get_fwd_accessor(RESHAPE_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(RESHAPE_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); From 143d496367d8a81c3167773ab00b3a5ce1f3a23a Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 10 Oct 2023 20:08:21 +0000 Subject: [PATCH 11/16] refine the reshape --- lib/runtime/src/ops/reshape.cc | 12 +++++++----- 1 file changed, 7 
insertions(+), 5 deletions(-) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 3e57da3c97..0cbb919529 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -55,7 +55,7 @@ OpTaskInvocation forward(ReshapeAttrs const &attrs) { per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - binding.bind(INPUT, input_parallel_tensor_shape(0)); + binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); return {RESHAPE_FWD_TASK_ID, binding}; } @@ -146,10 +146,11 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, auto per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); fwd_binding.bind_arg(PROFILING, settings); - fwd_binding.bind(INPUT, input_parallel_tensor_shape(0)); - fwd_binding.bind(OUTPUT, output_tensor(0)); + fwd_binding.bind(INPUT, input.shape); + fwd_binding.bind(OUTPUT, output_shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); @@ -167,8 +168,9 @@ template <> void register_task() { OpTaskSignature init(OpTaskType::INIT); - init.add_input_slot(INPUT); - init.add_output_slot(OUTPUT); + init.add_arg_slot(ATTRS); + + init.add_return_value(PER_DEVICE_STATE); register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); } From 2f798d59ff66a7ea3f4661ea5f9d8e114ca9a0bf Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 17 Oct 2023 21:13:06 +0000 Subject: [PATCH 12/16] reshape --- lib/kernels/src/cuda/reshape_kernels.cu | 4 ++-- lib/runtime/src/ops/reshape.cc | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index 554ab9fa6b..941d431a7c 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -55,14 +55,14 @@ struct 
BackwardKernel { } void forward_kernel(cudaStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { DataTypeDispatch1{}(m->data_type, stream, m, input, output); } void backward_kernel(cudaStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { DataTypeDispatch1{}(m->data_type, stream, m, input, output); diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 0cbb919529..66c6fae0bd 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -16,8 +16,6 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" #include "legion/legion_utilities.h" -#include "utils/exception.h" -#include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names @@ -114,8 +112,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor(INPUT); - auto output_grad = acc.get_tensor(OUTPUT); + auto input_grad = acc.get_tensor(INPUT); + auto output_grad = acc.get_tensor(OUTPUT); return profile(backward_kernel, profiling, From 803f0c041581696be35154a39d62b5e7cd19580b Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 17 Oct 2023 21:25:34 +0000 Subject: [PATCH 13/16] format the code --- lib/kernels/src/cuda/reshape_kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index 941d431a7c..49b6a1f5ba 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -55,7 +55,7 @@ struct BackwardKernel { } void forward_kernel(cudaStream_t stream, - ReshapePerDeviceState const &m, + ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, 
GenericTensorAccessorW const &output) { DataTypeDispatch1{}(m->data_type, stream, m, input, output); From d9db77703614ac05124d9fddf08a81ee67c9935a Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 18 Oct 2023 23:29:21 +0000 Subject: [PATCH 14/16] fix the error --- lib/kernels/src/cuda/reshape_kernels.cu | 5 ++--- lib/runtime/src/ops/reshape.cc | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index 49b6a1f5ba..b52822a7fc 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -42,7 +42,6 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - ReshapePerDeviceState const *m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { float alpha = 1.0f; @@ -58,14 +57,14 @@ void forward_kernel(cudaStream_t stream, ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(m->data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, m, input, output); } void backward_kernel(cudaStream_t stream, ReshapePerDeviceState const &m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(m->data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, m, input, output); } } // namespace Reshape diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 66c6fae0bd..794a36ef62 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -112,8 +112,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor(INPUT); - auto output_grad = acc.get_tensor(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = 
acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, From 2e3c715f8c16aa9de4dae4ef3db8e12964780a8e Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Thu, 19 Oct 2023 19:29:58 +0000 Subject: [PATCH 15/16] format the code --- lib/runtime/src/ops/reshape.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 794a36ef62..c9dc8cff8d 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -112,7 +112,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, From 33ed1638b419b848c1653e13d087c4b0e6a40a6e Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Thu, 19 Oct 2023 21:14:04 +0000 Subject: [PATCH 16/16] format the code --- lib/kernels/src/cuda/reshape_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index b52822a7fc..e935b0d0c2 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -57,14 +57,14 @@ void forward_kernel(cudaStream_t stream, ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(m.data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, input, output); } void backward_kernel(cudaStream_t stream, ReshapePerDeviceState const &m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(m.data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, input, output); } } // namespace Reshape