diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h
index aeea932f04..1220a93203 100644
--- a/lib/kernels/include/kernels/reduce_kernels.h
+++ b/lib/kernels/include/kernels/reduce_kernels.h
@@ -5,30 +5,37 @@ namespace FlexFlow {
-class ReducePerDeviceState : public PerDeviceOpState {
-public:
-  ReducePerDeviceState(FFHandler handler,
-                       Reduce const *rd,
-                       Legion::Domain const &input_domain);
-  ~ReducePerDeviceState(void);
-#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-  cudnnTensorDescriptor_t inputTensor, outputTensor;
-  cudnnReduceTensorDescriptor_t reduceDesc;
-#else
-  miopenTensorDescriptor_t inputTensor, outputTensor;
-  miopenReduceTensorDescriptor_t reduceDesc;
-#endif
+struct ReducePerDeviceState {
+  PerDeviceFFHandle handle;
+  ffTensorDescriptor_t inputTensor;
+  ffTensorDescriptor_t outputTensor;
+  ffReduceTensorDescriptor_t reduceDesc;
   OperatorType op_type;
   size_t reduction_size;
 };
+FF_VISITABLE_STRUCT(ReducePerDeviceState,
+                    handle,
+                    inputTensor,
+                    outputTensor,
+                    reduceDesc,
+                    op_type,
+                    reduction_size);
+
 namespace Kernels {
 namespace Reduce {
+
+ReducePerDeviceState init_kernel(PerDeviceFFHandle const &,
+                                 OperatorType const &,
+                                 size_t const &,
+                                 ArrayShape const &input_shape,
+                                 ArrayShape const &output_shape);
+
-void forward_kernel_wrapper(ReducePerDeviceState const *m,
+void forward_kernel_wrapper(ReducePerDeviceState const &m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output);
-void backward_kernel_wrapper(ReducePerDeviceState const *m,
+void backward_kernel_wrapper(ReducePerDeviceState const &m,
                              GenericTensorAccessorR const &output_grad,
                              GenericTensorAccessorW const &input_grad);
diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu
index d675f88073..a67f52999d 100644
--- a/lib/kernels/src/cuda/reduce_kernels.cu
+++ b/lib/kernels/src/cuda/reduce_kernels.cu
@@ -67,50 +67,73 @@ ReducePerDeviceState::~ReducePerDeviceState(void) {
 namespace
Kernels {
 namespace Reduce {
+
+ReducePerDeviceState init_kernel(PerDeviceFFHandle const &handle,
+                                 OperatorType const &op_type,
+                                 size_t const &reduction_size,
+                                 ArrayShape const &input_shape,
+                                 ArrayShape const &output_shape) {
+
+  ffTensorDescriptor_t inputTensor;
+  ffTensorDescriptor_t outputTensor;
+  ffReduceTensorDescriptor_t reduceDesc;
+
+  checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
+  checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
+  // TODO(review): reduceDesc is created but never configured (no
+  // cudnnSetReduceTensorDescriptor call) -- confirm it is set elsewhere.
+  checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+
+  checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape));
+  checkCUDNN(
+      cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape));
+
+  ReducePerDeviceState per_device = {
+      handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size};
+  return per_device;
+}
+
 void forward_kernel(cudaStream_t stream,
-                    ReducePerDeviceState const *m,
+                    ReducePerDeviceState const &m,
                     float const *input_ptr,
                     float *output_ptr) {
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
   float alpha = 1.0f, beta = 0.0f;
-  checkCUDNN(cudnnReduceTensor(m->handle.dnn,
-                               m->reduceDesc,
+  checkCUDNN(cudnnReduceTensor(m.handle.dnn,
+                               m.reduceDesc,
                                nullptr /*indices*/,
                                0 /*indicesSizeInBytes*/,
-                               m->handle.workSpace,
-                               m->handle.workSpaceSize,
+                               m.handle.workSpace,
+                               m.handle.workSpaceSize,
                                &alpha,
-                               m->inputTensor,
+                               m.inputTensor,
                                input_ptr,
                                &beta,
-                               m->outputTensor,
+                               m.outputTensor,
                                output_ptr));
 };
 void backward_kernel(cudaStream_t stream,
-                     ReducePerDeviceState const *m,
+                     ReducePerDeviceState const &m,
                      float const *output_grad_ptr,
                      float *input_grad_ptr) {
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
   float alpha = 1.0, beta = 1.0f;
-  switch (m->op_type) {
+  switch (m.op_type) {
     case OP_REDUCE_SUM:
       alpha = 1.0f;
       break;
     case OP_REDUCE_MEAN:
       // When the output is the average of multiple input elements
       // we need to scale the gradients by 1.0
/ reduction_size
-      alpha = 1.0f / m->reduction_size;
+      alpha = 1.0f / m.reduction_size;
       break;
     default:
       assert(false);
   }
-  checkCUDNN(cudnnAddTensor(m->handle.dnn,
+  checkCUDNN(cudnnAddTensor(m.handle.dnn,
                             &alpha,
-                            m->outputTensor,
+                            m.outputTensor,
                             output_grad_ptr,
                             &beta,
-                            m->inputTensor,
+                            m.inputTensor,
                             input_grad_ptr));
 }
diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index 8879cad592..2674dc4fef 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -1,7 +1,10 @@
 #include "reduce.h"
 #include "kernels/reduce_kernels.h"
 #include "legion/legion_utilities.h"
+#include "op-attrs/get_output_shape.h"
+#include "utils/exceptions.h"
 #include "utils/hash-utils.h"
+#include "utils/type_traits_core.h"
 namespace FlexFlow {
 // declare Legion names
@@ -22,414 +25,185 @@ using Legion::TaskLauncher;
 using namespace FlexFlow::Kernels::Reduce;
-bool operator==(ReduceParams const &lhs, ReduceParams const &rhs) {
-  return (lhs.axes == rhs.axes) && (lhs.keepdims == rhs.keepdims);
-}
+enum Slots {
+  INPUT,
+  OUTPUT,
+  ATTRS,
+  PROFILING,
+  REDUCE,
+  PER_DEVICE_STATE,
+  HANDLE
+};
+
+OpTaskInvocation init(ReduceAttrs const &attrs) {
+  OpTaskBinding binding;
+
+  binding.bind_arg(HANDLE, ff_handle());
+  binding.bind_arg(ATTRS, attrs);
-bool ReduceParams::is_valid(ParallelTensorShape const &input) const {
-  for (size_t i = 0; i < axes.size(); i++) {
-    if (axes[i] >= input.num_dims) {
-      return false;
-    }
-  }
-  return input.is_valid();
+  binding.bind(INPUT, input_tensor(0));
+  binding.bind(OUTPUT, output_tensor(0));
+
+  return {REDUCE_INIT_TASK_ID, binding};
 }
-ReduceParams Reduce::get_params() const {
-  ReduceParams params;
-  params.axes.clear();
-  for (int i = 0; i < num_axes; i++) {
-    params.axes.push_back(this->axes[i]);
-  }
-  params.keepdims = keepdims;
-  return params;
+static DeviceSpecific
+    init_task_impl(TaskArgumentAccessor const &acc) {
+  PerDeviceFFHandle handle = acc.get_argument(HANDLE);
+  auto attrs = acc.get_argument(ATTRS);
+  auto input = acc.get_tensor(INPUT);
+  auto output = acc.get_tensor(OUTPUT);
+
+  OperatorType op_type = attrs.op_type;
+  // Note: How to set the reduction size?
+  size_t reduction_size = input.shape.get_volume() / output.shape.get_volume();
+  DeviceSpecific per_device_state =
+      acc.create_device_specific(init_kernel(
+          handle, op_type, reduction_size, input.shape, output.shape));
+  return per_device_state;
 }
-Tensor FFModel::reduce_sum(OperatorType op,
-                           const Tensor input,
-                           std::vector const &_axes,
-                           bool keepdims,
-                           char const *name) {
-  Layer *rd = new Layer(this,
-                        op,
-                        DT_FLOAT,
-                        name,
-                        1 /*input*/,
-                        0 /*weights*/,
-                        1 /*outputs*/,
-                        input);
-  // Use Legion indexing to store axes
-  std::vector axes;
-  for (size_t i = 0; i < _axes.size(); i++) {
-    axes.push_back(input->num_dims - 1 - _axes[i]);
-  }
-  int dims[MAX_TENSOR_DIM];
-  int numdim = input->num_dims;
-  if (keepdims) {
-    for (int i = 0; i < input->num_dims; i++) {
-      dims[i] = input->dims[i];
-    }
-    for (size_t i = 0; i < axes.size(); i++) {
-      dims[axes[i]] = 1;
-    }
-  } else {
-    numdim = 0;
-    for (int i = 0; i < input->num_dims; i++) {
-      bool reduced = false;
-      for (size_t j = 0; j < axes.size(); j++) {
-        if (axes[j] == i) {
-          reduced = true;
-        }
-      }
-      if (!reduced) {
-        dims[numdim++] = input->dims[i];
-      }
-    }
-    assert(numdim + axes.size() == input->num_dims);
-  }
-  rd->outputs[0] = create_tensor_legion_ordering(
-      numdim, dims, input->data_type, rd, 0, true /*create_grad*/);
-  rd->add_int_vector_property("legion_axes", axes);
-  rd->add_int_property("keepdims", keepdims);
-  layers.push_back(rd);
-  return rd->outputs[0];
+static DeviceSpecific
+    init_task(Task const *task,
+              std::vector const &regions,
+              Context ctx,
+              Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  return init_task_impl(acc);
 }
-Tensor FFModel::reduce_sum(const Tensor input,
-                           std::vector const &_axes,
-                           bool keepdims,
-                           char const *name) {
-  return this->reduce(OP_REDUCE_SUM, input, _axes, keepdims, name);
+template <>
+void register_task() {
+  OpTaskSignature init(OpTaskType::INIT);
+
+  init.add_unchecked_arg_slot(HANDLE);
+  init.add_arg_slot(ATTRS);
+
+  init.add_return_value();
+
+  register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task);
 }
-Tensor FFModel::reduce_mean(const Tensor input,
-                            std::vector const &_axes,
-                            bool keepdims,
-                            char const *name) {
-  return this->reduce(OP_REDUCE_MEAN, input, _axes, keepdims, name);
+// Note: forward_kernel only needs ReducePerDeviceState, input, output
+OpTaskInvocation forward(ReduceAttrs const &attrs) {
+  OpTaskBinding binding;
+
+  binding.bind_arg(PER_DEVICE_STATE, per_device_op_state());
+  binding.bind_arg(PROFILING, profiling_tensor());
+
+  binding.bind(INPUT, input_tensor(0));
+  binding.bind(OUTPUT, output_tensor(0));
+
+  return {REDUCE_FWD_TASK_ID, binding};
 }
-Op *Reduce::create_operator_from_layer(
-    FFModel &model,
-    Layer const *layer,
-    std::vector const &inputs) {
-  std::vector axes;
-  long long value;
-  layer->get_int_vector_property("legion_axes", axes);
-  layer->get_int_property("keepdims", value);
-  bool keepdims = value;
-  return new Reduce(
-      model, layer->op_type, inputs[0], axes, keepdims, layer->name);
+static optional forward_task_impl(TaskArgumentAccessor const &acc) {
+  auto per_device_state =
+      acc.get_argument(PER_DEVICE_STATE);
+  ProfilingSettings profiling = acc.get_argument(PROFILING);
+
+  auto input = acc.get_tensor(INPUT);
+  auto output = acc.get_tensor(OUTPUT);
+
+  return profile(forward_kernel,
+                 profiling,
+                 "[Reduce] forward_time = %.2lfms\n",
+                 per_device_state,
+                 input.get_float_ptr(),
+                 output.get_float_ptr());
 }
-Reduce::Reduce(FFModel &model,
-               ReduceParams const &params,
-               const ParallelTensor input,
-               char const *name)
-    : Reduce(model, params.op_type, input, params.axes, params.keepdims, name) {
+static void forward_task(Task const *task,
+                         std::vector const &regions,
+                         Context ctx,
+                         Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+ forward_task_impl(acc); } -Reduce::Reduce(FFModel &model, - OperatorType _op_type, - const ParallelTensor input, - std::vector const &_axes, - bool _keepdims, - char const *name) - : Op(model, - _op_type, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input), - num_axes(_axes.size()), keepdims(_keepdims) { - for (size_t i = 0; i < num_axes; i++) { - axes[i] = _axes[i]; - } - int num_dims = input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - if (keepdims) { - num_dims = input->num_dims; - for (int i = 0; i < num_dims; i++) { - dims[i] = input->dims[i]; - } - for (int i = 0; i < num_axes; i++) { - // Currently assume that we cannot parallelize along reduced dims - assert(dims[axes[i]].degree == 1); - dims[axes[i]].size = 1; - } - } else { - num_dims = 0; - for (int i = 0; i < input->num_dims; i++) { - bool reduced = false; - for (int j = 0; j < num_axes; j++) { - if (axes[j] == i) { - reduced = true; - } - } - if (!reduced) { - dims[num_dims++] = input->dims[i]; - } else { - // Currently assume that we cannot parallelize along reduced dims - assert(input->dims[i].degree == 1); - assert(input->dims[i].parallel_idx == -1); - } - } - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - num_dims, dims, input->data_type, this); +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FORWARD); + + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_arg_slot(PROFILING); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + + register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task); } -void Reduce::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(REDUCE_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Reduce)), - argmap, - Predicate::TRUE_PRED, 
- false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -}; +OpTaskInvocation backward(ReduceAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); -PerDeviceOpState *Reduce::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - Reduce *rd = (Reduce *)task->args; - FFHandler handle = *((FFHandler *)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); - ReduceMeta *m = new ReduceMeta(handle, rd, input.domain); - return m; + return {REDUCE_BWD_TASK_ID, binding}; } -void Reduce::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(REDUCE_FWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, false), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, 
FID_DATA);
-  runtime->execute_index_space(ctx, launcher);
+static optional backward_task_impl(TaskArgumentAccessor const &acc) {
+  auto per_device_state =
+      acc.get_argument(PER_DEVICE_STATE);
+  ProfilingSettings profiling = acc.get_argument(PROFILING);
+
+  auto input_grad = acc.get_tensor_grad(INPUT);
+  auto output_grad = acc.get_tensor_grad(OUTPUT);
+
+  return profile(backward_kernel,
+                 profiling,
+                 "[Reduce] backward_time = %.2lfms\n",
+                 per_device_state,
+                 output_grad.get_float_ptr(),
+                 input_grad.get_float_ptr());
 }
-void Reduce::forward_task(Task const *task,
+static void backward_task(Task const *task,
                           std::vector const &regions,
                           Context ctx,
                           Runtime *runtime) {
-  assert(regions.size() == 2);
-  assert(task->regions.size() == 2);
-  ReduceMeta const *m = *((ReduceMeta **)task->local_args);
-  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
-      DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-
-  forward_kernel_wrapper(m, input, output);
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  backward_task_impl(acc);
 }
-void Reduce::backward(FFModel const &ff) {
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_backward(ff, argmap);
-  IndexLauncher launcher(REDUCE_BWD_TASK_ID,
-                         parallel_is,
-                         TaskArgument(nullptr, 0),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  // regions[0](I): output_grad
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region_grad));
-  launcher.add_field(0, FID_DATA);
-  // regions[1](I/O): input_grad
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    READ_WRITE,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region_grad));
-  launcher.add_field(1, FID_DATA);
-  runtime->execute_index_space(ctx, launcher);
-}
+template <>
+void register_task() {
+  OpTaskSignature bwd =
+      infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID));
-void Reduce::backward_task(Task const *task,
-                           std::vector const &regions,
-                           Context ctx,
-                           Runtime *runtime) {
-  assert(regions.size() == 2);
-  assert(task->regions.size() == 2);
-  ReduceMeta const *m = *((ReduceMeta **)task->local_args);
-  GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO(
-      DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
-      DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  backward_kernel_wrapper(m, output_grad, input_grad);
+  register_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task);
 }
-bool Reduce::measure_operator_cost(Simulator *sim,
-                                   MachineView const &mv,
-                                   CostMetrics &cost_metrics) const {
-  ParallelTensorBase sub_input, sub_output;
-  if (!outputs[0]->get_sub_tensor(mv, sub_output)) {
-    return false;
-  }
-  if (!inputs[0]->get_sub_tensor(mv, sub_input)) {
-    return false;
-  }
-  ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain());
-  sim->free_all();
-  float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-  assert(input_ptr != NULL);
-  cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-  GenericTensorAccessorR input_acc(
-      inputs[0]->data_type, sub_input.get_domain(), input_ptr);
-
-  float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-  assert(output_ptr != NULL);
-  cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-  GenericTensorAccessorW output_acc(
-      outputs[0]->data_type, sub_output.get_domain(), output_ptr);
-
-  assert(m->profiling == false);
-
-  std::function forward, backward;
-  forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); };
-  if (sim->computationMode
== COMP_MODE_TRAINING) {
-    float *input_grad_ptr =
-        (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-    assert(input_grad_ptr != NULL);
-    cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-    GenericTensorAccessorW input_grad_acc(
-        inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr);
-
-    float *output_grad_ptr =
-        (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-    assert(output_grad_ptr != NULL);
-    cost_metrics.outputs_memory +=
-        cost_metrics.total_mem_diff_from(sim->offset);
-    GenericTensorAccessorR output_grad_acc(
-        outputs[0]->data_type, sub_output.get_domain(), output_grad_ptr);
-
-    backward = [&] {
-      backward_kernel_wrapper(m, output_grad_acc, input_grad_acc);
-    };
-  }
-
-  inner_measure_operator_cost(sim, forward, backward, cost_metrics);
-
-  if (sim->computationMode == COMP_MODE_TRAINING) {
-    printf("[Measure Reduce] name(%s) forward_time(%.4lf) "
-           "backward_time(%.4lf)\n",
-           name,
-           cost_metrics.forward_time,
-           cost_metrics.backward_time);
-  } else {
-    printf("[Measure Reduce] name(%s) forward_time(%.4lf)\n",
-           name,
-           cost_metrics.forward_time);
-  }
-
-  return true;
-}
+CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
+                                  ReduceAttrs const &attrs,
+                                  InputParallelTensorDesc const &input,
+                                  ProfilingSettings const &settings,
+                                  MachineView const &machine_view) {
+  auto env = sim_factory.new_environment();
-void Reduce::serialize(Legion::Serializer &sez) const {
-  ReduceParams params = get_params();
-  sez.serialize(params.op_type);
-  sez.serialize(params.axes.size());
-  for (size_t i = 0; i < params.axes.size(); i++) {
-    sez.serialize(params.axes[i]);
-  }
-  sez.serialize(params.keepdims);
-}
+  SimTaskBinding init_binding;
+  init_binding.bind_arg(ATTRS, attrs);
+  init_binding.bind_arg(HANDLE, ff_handle());
-using PCG::Node;
-Node Reduce::deserialize(FFModel &ff,
-                         Legion::Deserializer &dez,
-                         ParallelTensor inputs[],
-                         int num_inputs) {
-  assert(num_inputs == 1);
-  OperatorType op_type;
-  size_t axes_size;
-  bool keepdims;
-  std::vector axes;
-  dez.deserialize(op_type);
-  dez.deserialize(axes_size);
-  for (size_t i = 0; i < axes_size; i++) {
-    int dim_idx;
-    dez.deserialize(dim_idx);
-    axes.push_back(dim_idx);
-  }
-  dez.deserialize(keepdims);
-  return ff.get_or_create_node(inputs[0], {axes, op_type, keepdims});
-}
+  auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding);
+  DeviceSpecific per_device_state =
+      init_task_impl(init_accessor);
-Op *Reduce::materialize(FFModel &ff,
-                        ParallelTensor inputs[],
-                        int num_inputs) const {
-  ReduceParams params = get_params();
-  return new Reduce(ff, params, inputs[0], this->name);
-}
+  SimTaskBinding fwd_binding;
+  ParallelTensorShape output_shape = get_output_shape(attrs, input.shape);
+  fwd_binding.bind(INPUT, input.shape);
+  fwd_binding.bind(OUTPUT, output_shape);
+  fwd_binding.bind_arg(PROFILING, settings);
+  fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state);
-}; // namespace FlexFlow
+  SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
+
+  auto fwd_accessor = env.get_fwd_accessor(REDUCE_FWD_TASK_ID, fwd_binding);
+  auto bwd_accessor = env.get_bwd_accessor(REDUCE_BWD_TASK_ID, bwd_binding);
-namespace std {
-size_t hash::operator()(
-    FlexFlow::ReduceParams const &params) const {
-  size_t key = 0;
-  hash_combine(key, params.op_type);
-  hash_combine(key, params.axes.size());
-  for (int n : params.axes) {
-    hash_combine(key, n);
-  }
-  hash_combine(key, params.keepdims);
-  return key;
+  float forward_time = forward_task_impl(fwd_accessor).value();
+  float backward_time = backward_task_impl(bwd_accessor).value();
+
+  float sync_time = default_estimate_sync_time(env);
+  return make_metrics(forward_time, backward_time, sync_time, env);
 }
-}; // namespace std
+
+}; // namespace FlexFlow
diff --git a/lib/runtime/src/ops/reduce.h b/lib/runtime/src/ops/reduce.h
index 52b4eff609..099083ed67 100644
--- a/lib/runtime/src/ops/reduce.h
+++ b/lib/runtime/src/ops/reduce.h
@@ -2,8 +2,8 @@ #define
_FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #include "op-attrs/ops/reduce.h" -#include "op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReduceAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReduceAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view);