diff --git a/lib/op-attrs/include/op-attrs/get_output_shapes.h b/lib/op-attrs/include/op-attrs/get_output_shapes.h index bc8101d1d2..883fa51c90 100644 --- a/lib/op-attrs/include/op-attrs/get_output_shapes.h +++ b/lib/op-attrs/include/op-attrs/get_output_shapes.h @@ -3,6 +3,7 @@ #include "op-attrs/operator_attrs.h" #include "op-attrs/parallel_tensor_shape.h" +#include "ops/reverse.h" #include "tensor_shape.h" #include "utils/containers.h" #include "utils/optional.h" @@ -140,6 +141,8 @@ ParallelTensorShape get_output_shape(RepartitionAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(ReplicateAttrs const &, ParallelTensorShape const &); +ParallelTensorShape get_output_shape(ReverseAttrs const &, + ParallelTensorShape const &); std::vector get_output_shapes(SplitAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(SoftmaxAttrs const &, diff --git a/lib/runtime/src/ops/reverse.cc b/lib/runtime/src/ops/reverse.cc index 2ae9fea18d..ac64146cd1 100644 --- a/lib/runtime/src/ops/reverse.cc +++ b/lib/runtime/src/ops/reverse.cc @@ -14,7 +14,9 @@ */ #include "reverse.h" +#include "kernels/accessor.h" #include "kernels/reverse_kernels.h" +#include "op-attrs/get_output_shapes.h" namespace FlexFlow { // declare Legion names @@ -35,281 +37,145 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reverse; -Tensor FFModel::reverse(const Tensor input, int axis, char const *name) { - assert(false); -#ifdef DEADCODE - Reverse *reverse = new Reverse(*this, input, axis, name); - layers.push_back(reverse); - return reverse->outputs[0]; -#endif -} +enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; -Reverse::Reverse(FFModel &model, - const ParallelTensor input, - int _axis, - char const *name) - : Op(model, - OP_REVERSE, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input), - axis(_axis) { - numOutputs = 1; - int numdim = input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 
0; i < numdim; i++) { - dims[i] = input->dims[i]; - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, input->data_type, this); -} +OpTaskInvocation forward(ReverseAttrs const &attrs) { + OpTaskBinding binding; -void Reverse::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - IndexLauncher launcher(REVERSE_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Reverse)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(ATTRS, attrs); + + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); -PerDeviceOpState *Reverse::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - return NULL; + return {REVERSE_FWD_TASK_ID, binding}; } +OpTaskInvocation backward(ReverseAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); -void Reverse::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - IndexLauncher launcher(REVERSE_FWD_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Reverse)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - 
READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return {REVERSE_BWD_TASK_ID, binding}; } -void Reverse::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - Reverse const *reverse = (Reverse const *)task->args; - Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(out_domain == in_domain); - float const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - int axis = in_domain.get_dim() - reverse->axis - 1; +static optional forward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto attrs = acc.get_argument(ATTRS); + + int output_size = output.shape.get_volume(); + int axis = output.shape.get_dim() - attrs.axis - 1; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; - for (int i = 0; i < out_domain.get_dim(); i++) { + for (int i = 0; i < output.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= out_domain.hi()[i] - out_domain.lo()[i] + 1; + in_blk_size *= output.shape[i]; } else if (i == axis) { - reverse_dim_size = out_domain.hi()[i] - out_domain.lo()[i] + 1; + reverse_dim_size = output.shape[i]; } else { - num_out_blks *= out_domain.hi()[i] - out_domain.lo()[i] + 1; + num_out_blks *= output.shape[i]; } } - int output_size = out_domain.get_volume(); - 
forward_kernel_wrapper(in_ptr, - out_ptr, - num_out_blks, - reverse_dim_size, - in_blk_size, - output_size); + return profile(forward_kernel, + profiling, + "[reverse] forward_time = %.2lfms\n", + input.get_float_ptr(), + output.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + output_size); } -void Reverse::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - IndexLauncher launcher(REVERSE_BWD_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Reverse)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - // regions[1](I/O): input0_grad - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); +static void forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); } -void Reverse::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - Reverse const *reverse = (Reverse const *)task->args; - Domain out_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain in_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(out_grad_domain == in_grad_domain); - float const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *in_grad_ptr = 
helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - // We reuse the forward kernel for backward tasks - int axis = in_grad_domain.get_dim() - reverse->axis - 1; +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto attrs = acc.get_argument(ATTRS); + + int axis = input_grad.shape.get_dim() - attrs.axis - 1; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; - for (int i = 0; i < in_grad_domain.get_dim(); i++) { + for (int i = 0; i < input_grad.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= in_grad_domain.hi()[i] - in_grad_domain.lo()[i] + 1; + in_blk_size *= input_grad.shape[i]; } else if (i == axis) { - reverse_dim_size = in_grad_domain.hi()[i] - in_grad_domain.lo()[i] + 1; + reverse_dim_size = input_grad.shape[i]; } else { - num_out_blks *= in_grad_domain.hi()[i] - in_grad_domain.lo()[i] + 1; + num_out_blks *= input_grad.shape[i]; } } - backward_kernel_wrapper(out_grad_ptr, - in_grad_ptr, - num_out_blks, - reverse_dim_size, - in_blk_size, - in_grad_domain.get_volume()); + return profile(backward_kernel, + profiling, + "[reverse] backward_time = %.2lfms\n", + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + num_out_blks, + reverse_dim_size, + in_blk_size, + input_grad.shape.get_volume()); } -bool Reverse::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_input, sub_output; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } +static void backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); +} - sim->free_all(); - float *input_ptr = (float 
*)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + ReverseAttrs const &attrs, + InputParallelTensorDesc const &input, + ProfilingSettings const &settings, + MachineView const &machine_view) { + auto env = sim_factory.new_environment(); - float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_ptr != NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + SimTaskBinding fwd_binding; - coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; - for (int i = 0; i < sub_output.num_dims; i++) { - if (i < axis) { - in_blk_size *= sub_output.dims[i].size; - } else if (i == axis) { - reverse_dim_size = sub_output.dims[i].size; - } else { - num_out_blks *= sub_output.dims[i].size; - } - } + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(input_ptr, - output_ptr, - num_out_blks, - reverse_dim_size, - in_blk_size, - sub_output.get_volume()); - }; - if (sim->computationMode == COMP_MODE_TRAINING) { - float *input_grad_ptr = - (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_grad_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + fwd_binding.bind(INPUT, input.shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(ATTRS, attrs); - float *output_grad_ptr = - (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_grad_ptr != NULL); - cost_metrics.outputs_memory += - cost_metrics.total_mem_diff_from(sim->offset); + auto fwd_accessor = env.get_fwd_accessor(REVERSE_FWD_TASK_ID, fwd_binding); - backward = [&] { - backward_kernel_wrapper(output_grad_ptr, - input_grad_ptr, - num_out_blks, - 
reverse_dim_size, - in_blk_size, - sub_input.get_volume()); - }; - } + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(REVERSE_BWD_TASK_ID, bwd_binding); - inner_measure_operator_cost(sim, forward, backward, cost_metrics); + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); - if (sim->computationMode == COMP_MODE_TRAINING) { - printf( - "[Measure Reverse] name(%s) forward_time(%.4lf) backward_time(%.4lf)\n", - name, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Reverse] name(%s) forward_time(%.4lf)\n", - name, - cost_metrics.forward_time); - } + float sync_time = default_estimate_sync_time(env); + + return make_metrics(forward_time, backward_time, sync_time, env); +} + +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_arg_slot(PROFILING); + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + + register_task(REVERSE_FWD_TASK_ID, "Reverse forward", fwd, forward_task); } - return true; +template <> +void register_task() { + OpTaskSignature bwd = + infer_bwd_signature(get_op_signature(REVERSE_FWD_TASK_ID)); + register_task(REVERSE_BWD_TASK_ID, "Reverse backward", bwd, backward_task); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reverse.h b/lib/runtime/src/ops/reverse.h index 384f0b58ae..af4d335429 100644 --- a/lib/runtime/src/ops/reverse.h +++ b/lib/runtime/src/ops/reverse.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_REVERSE_H_ #include "op-attrs/ops/reverse.h" -#include "op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { @@ -20,43 +20,10 @@ OpTaskInvocation backward(ReverseAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReverseAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, 
ProfilingSettings const &settings, MachineView const &machine_view); -/* class Reverse : public Op { */ -/* public: */ -/* Reverse(FFModel &model, */ -/* const ParallelTensor input, */ -/* int axis, */ -/* char const *name); */ -/* void init(FFModel const &) override; */ -/* void forward(FFModel const &) override; */ -/* void backward(FFModel const &) override; */ - -/* static PerDeviceOpState *init_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void forward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void backward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* bool measure_operator_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const override; */ - -/* public: */ -/* int axis; */ -/* }; */ - } // namespace FlexFlow #endif