diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h
index aeea932f04..1220a93203 100644
--- a/lib/kernels/include/kernels/reduce_kernels.h
+++ b/lib/kernels/include/kernels/reduce_kernels.h
@@ -5,30 +5,37 @@ namespace FlexFlow {
-class ReducePerDeviceState : public PerDeviceOpState {
-public:
-  ReducePerDeviceState(FFHandler handler,
-                       Reduce const *rd,
-                       Legion::Domain const &input_domain);
-  ~ReducePerDeviceState(void);
-#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-  cudnnTensorDescriptor_t inputTensor, outputTensor;
-  cudnnReduceTensorDescriptor_t reduceDesc;
-#else
-  miopenTensorDescriptor_t inputTensor, outputTensor;
-  miopenReduceTensorDescriptor_t reduceDesc;
-#endif
+struct ReducePerDeviceState {
+  PerDeviceFFHandle handle;
+  ffTensorDescriptor_t inputTensor;
+  ffTensorDescriptor_t outputTensor;
+  ffReduceTensorDescriptor_t reduceDesc;
   OperatorType op_type;
   size_t reduction_size;
 };
+FF_VISITABLE_STRUCT(ReducePerDeviceState,
+                    handle,
+                    inputTensor,
+                    outputTensor,
+                    reduceDesc,
+                    op_type,
+                    reduction_size);
+
 namespace Kernels {
 namespace Reduce {
+
+ReducePerDeviceState init_kernel(PerDeviceFFHandle const &,
+                                 OperatorType const &,
+                                 size_t const &,
+                                 ArrayShape const &input_shape,
+                                 ArrayShape const &output_shape);
+
-void forward_kernel_wrapper(ReducePerDeviceState const *m,
+void forward_kernel_wrapper(ReducePerDeviceState const &m,
                             GenericTensorAccessorR const &input,
                             GenericTensorAccessorW const &output);
-void backward_kernel_wrapper(ReducePerDeviceState const *m,
+void backward_kernel_wrapper(ReducePerDeviceState const &m,
                              GenericTensorAccessorR const &output_grad,
                              GenericTensorAccessorW const &input_grad);
diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu
index d675f88073..a67f52999d 100644
--- a/lib/kernels/src/cuda/reduce_kernels.cu
+++ b/lib/kernels/src/cuda/reduce_kernels.cu
@@ -67,50 +67,73 @@ ReducePerDeviceState::~ReducePerDeviceState(void) {
 namespace
Kernels {
 namespace Reduce {
+
+ReducePerDeviceState init_kernel(PerDeviceFFHandle const &handle,
+                                 OperatorType const &op_type,
+                                 size_t const &reduction_size,
+                                 ArrayShape const &input_shape,
+                                 ArrayShape const &output_shape) {
+
+  ffTensorDescriptor_t inputTensor;
+  ffTensorDescriptor_t outputTensor;
+  ffReduceTensorDescriptor_t reduceDesc;
+
+  checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
+  checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
+  // TODO(review): reduceDesc is created but never configured (no
+  // cudnnSetReduceTensorDescriptor call) -- confirm it is set elsewhere.
+  checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc));
+
+  checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape));
+  checkCUDNN(
+      cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape));
+
+  ReducePerDeviceState per_device = {
+      handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size};
+  return per_device;
+}
+
 void forward_kernel(cudaStream_t stream,
-                    ReducePerDeviceState const *m,
+                    ReducePerDeviceState const &m,
                     float const *input_ptr,
                     float *output_ptr) {
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
   float alpha = 1.0f, beta = 0.0f;
-  checkCUDNN(cudnnReduceTensor(m->handle.dnn,
-                               m->reduceDesc,
+  checkCUDNN(cudnnReduceTensor(m.handle.dnn,
+                               m.reduceDesc,
                                nullptr /*indices*/,
                                0 /*indicesSizeInBytes*/,
-                               m->handle.workSpace,
-                               m->handle.workSpaceSize,
+                               m.handle.workSpace,
+                               m.handle.workSpaceSize,
                                &alpha,
-                               m->inputTensor,
+                               m.inputTensor,
                                input_ptr,
                                &beta,
-                               m->outputTensor,
+                               m.outputTensor,
                                output_ptr));
 };
 void backward_kernel(cudaStream_t stream,
-                     ReducePerDeviceState const *m,
+                     ReducePerDeviceState const &m,
                      float const *output_grad_ptr,
                      float *input_grad_ptr) {
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
   float alpha = 1.0, beta = 1.0f;
-  switch (m->op_type) {
+  switch (m.op_type) {
     case OP_REDUCE_SUM:
       alpha = 1.0f;
       break;
     case OP_REDUCE_MEAN:
       // When the output is the average of multiple input elements
       // we need to scale the gradients by 1.0
/ reduction_size
-      alpha = 1.0f / m->reduction_size;
+      alpha = 1.0f / m.reduction_size;
       break;
     default:
       assert(false);
   }
-  checkCUDNN(cudnnAddTensor(m->handle.dnn,
+  checkCUDNN(cudnnAddTensor(m.handle.dnn,
                             &alpha,
-                            m->outputTensor,
+                            m.outputTensor,
                             output_grad_ptr,
                             &beta,
-                            m->inputTensor,
+                            m.inputTensor,
                             input_grad_ptr));
 }
diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc
index 8879cad592..2674dc4fef 100644
--- a/lib/runtime/src/ops/reduce.cc
+++ b/lib/runtime/src/ops/reduce.cc
@@ -1,7 +1,10 @@
 #include "reduce.h"
 #include "kernels/reduce_kernels.h"
 #include "legion/legion_utilities.h"
+#include "op-attrs/get_output_shape.h"
+#include "utils/exceptions.h"
 #include "utils/hash-utils.h"
+#include "utils/type_traits_core.h"
 namespace FlexFlow {
 // declare Legion names
@@ -22,414 +25,185 @@ using Legion::TaskLauncher;
 using namespace FlexFlow::Kernels::Reduce;
-bool operator==(ReduceParams const &lhs, ReduceParams const &rhs) {
-  return (lhs.axes == rhs.axes) && (lhs.keepdims == rhs.keepdims);
-}
+enum Slots {
+  INPUT,
+  OUTPUT,
+  ATTRS,
+  PROFILING,
+  REDUCE,
+  PER_DEVICE_STATE,
+  HANDLE
+};
+
+OpTaskInvocation init(ReduceAttrs const &attrs) {
+  OpTaskBinding binding;
+
+  binding.bind_arg(HANDLE, ff_handle());
+  binding.bind_arg(ATTRS, attrs);
-bool ReduceParams::is_valid(ParallelTensorShape const &input) const {
-  for (size_t i = 0; i < axes.size(); i++) {
-    if (axes[i] >= input.num_dims) {
-      return false;
-    }
-  }
-  return input.is_valid();
+  binding.bind(INPUT, input_tensor(0));
+  binding.bind(OUTPUT, output_tensor(0));
+
+  return {REDUCE_INIT_TASK_ID, binding};
 }
-ReduceParams Reduce::get_params() const {
-  ReduceParams params;
-  params.axes.clear();
-  for (int i = 0; i < num_axes; i++) {
-    params.axes.push_back(this->axes[i]);
-  }
-  params.keepdims = keepdims;
-  return params;
+static DeviceSpecific
+    init_task_impl(TaskArgumentAccessor const &acc) {
+  PerDeviceFFHandle handle = acc.get_argument(HANDLE);
+  auto attrs = acc.get_argument(ATTRS);
+  auto input = acc.get_tensor(INPUT);
+  auto output = acc.get_tensor(OUTPUT);
+
+  OperatorType op_type = attrs.op_type;
+  // Note: How to set the reduction size?
+  size_t reduction_size = input.shape.get_volume() / output.shape.get_volume();
+  DeviceSpecific per_device_state =
+      acc.create_device_specific(init_kernel(
+          handle, op_type, reduction_size, input.shape, output.shape));
+  return per_device_state;
 }
-Tensor FFModel::reduce_sum(OperatorType op,
-                           const Tensor input,
-                           std::vector const &_axes,
-                           bool keepdims,
-                           char const *name) {
-  Layer *rd = new Layer(this,
-                        op,
-                        DT_FLOAT,
-                        name,
-                        1 /*input*/,
-                        0 /*weights*/,
-                        1 /*outputs*/,
-                        input);
-  // Use Legion indexing to store axes
-  std::vector axes;
-  for (size_t i = 0; i < _axes.size(); i++) {
-    axes.push_back(input->num_dims - 1 - _axes[i]);
-  }
-  int dims[MAX_TENSOR_DIM];
-  int numdim = input->num_dims;
-  if (keepdims) {
-    for (int i = 0; i < input->num_dims; i++) {
-      dims[i] = input->dims[i];
-    }
-    for (size_t i = 0; i < axes.size(); i++) {
-      dims[axes[i]] = 1;
-    }
-  } else {
-    numdim = 0;
-    for (int i = 0; i < input->num_dims; i++) {
-      bool reduced = false;
-      for (size_t j = 0; j < axes.size(); j++) {
-        if (axes[j] == i) {
-          reduced = true;
-        }
-      }
-      if (!reduced) {
-        dims[numdim++] = input->dims[i];
-      }
-    }
-    assert(numdim + axes.size() == input->num_dims);
-  }
-  rd->outputs[0] = create_tensor_legion_ordering(
-      numdim, dims, input->data_type, rd, 0, true /*create_grad*/);
-  rd->add_int_vector_property("legion_axes", axes);
-  rd->add_int_property("keepdims", keepdims);
-  layers.push_back(rd);
-  return rd->outputs[0];
+static DeviceSpecific
+    init_task(Task const *task,
+              std::vector const &regions,
+              Context ctx,
+              Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  return init_task_impl(acc);
 }
-Tensor FFModel::reduce_sum(const Tensor input,
-                           std::vector const &_axes,
-                           bool keepdims,
-                           char const *name) {
-  return this->reduce(OP_REDUCE_SUM, input, _axes, keepdims, name);
+template <>
+void register_task() {
+  OpTaskSignature init(OpTaskType::INIT);
+
+  init.add_unchecked_arg_slot(HANDLE);
+  init.add_arg_slot(ATTRS);
+
+  init.add_return_value();
+
+  register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task);
 }
-Tensor FFModel::reduce_mean(const Tensor input,
-                            std::vector const &_axes,
-                            bool keepdims,
-                            char const *name) {
-  return this->reduce(OP_REDUCE_MEAN, input, _axes, keepdims, name);
+// Note: forward_kernel only needs ReducePerDeviceState, input, output
+OpTaskInvocation forward(ReduceAttrs const &attrs) {
+  OpTaskBinding binding;
+
+  binding.bind_arg(PER_DEVICE_STATE, per_device_op_state());
+  binding.bind_arg(PROFILING, profiling_tensor());
+
+  binding.bind(INPUT, input_tensor(0));
+  binding.bind(OUTPUT, output_tensor(0));
+
+  return {REDUCE_FWD_TASK_ID, binding};
 }
-Op *Reduce::create_operator_from_layer(
-    FFModel &model,
-    Layer const *layer,
-    std::vector const &inputs) {
-  std::vector axes;
-  long long value;
-  layer->get_int_vector_property("legion_axes", axes);
-  layer->get_int_property("keepdims", value);
-  bool keepdims = value;
-  return new Reduce(
-      model, layer->op_type, inputs[0], axes, keepdims, layer->name);
+static optional forward_task_impl(TaskArgumentAccessor const &acc) {
+  auto per_device_state =
+      acc.get_argument(PER_DEVICE_STATE);
+  ProfilingSettings profiling = acc.get_argument(PROFILING);
+
+  auto input = acc.get_tensor(INPUT);
+  auto output = acc.get_tensor(OUTPUT);
+
+  return profile(forward_kernel,
+                 profiling,
+                 "[Reduce] forward_time = %.2lfms\n",
+                 per_device_state,
+                 input.get_float_ptr(),
+                 output.get_float_ptr());
 }
-Reduce::Reduce(FFModel &model,
-               ReduceParams const &params,
-               const ParallelTensor input,
-               char const *name)
-    : Reduce(model, params.op_type, input, params.axes, params.keepdims, name) {
+static void forward_task(Task const *task,
+                         std::vector const &regions,
+                         Context ctx,
+                         Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+ forward_task_impl(acc); } -Reduce::Reduce(FFModel &model, - OperatorType _op_type, - const ParallelTensor input, - std::vector const &_axes, - bool _keepdims, - char const *name) - : Op(model, - _op_type, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input), - num_axes(_axes.size()), keepdims(_keepdims) { - for (size_t i = 0; i < num_axes; i++) { - axes[i] = _axes[i]; - } - int num_dims = input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - if (keepdims) { - num_dims = input->num_dims; - for (int i = 0; i < num_dims; i++) { - dims[i] = input->dims[i]; - } - for (int i = 0; i < num_axes; i++) { - // Currently assume that we cannot parallelize along reduced dims - assert(dims[axes[i]].degree == 1); - dims[axes[i]].size = 1; - } - } else { - num_dims = 0; - for (int i = 0; i < input->num_dims; i++) { - bool reduced = false; - for (int j = 0; j < num_axes; j++) { - if (axes[j] == i) { - reduced = true; - } - } - if (!reduced) { - dims[num_dims++] = input->dims[i]; - } else { - // Currently assume that we cannot parallelize along reduced dims - assert(input->dims[i].degree == 1); - assert(input->dims[i].parallel_idx == -1); - } - } - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - num_dims, dims, input->data_type, this); +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FORWARD); + + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_arg_slot(PROFILING); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + + register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task); } -void Reduce::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(REDUCE_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Reduce)), - argmap, - Predicate::TRUE_PRED, 
- false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -}; +OpTaskInvocation backward(ReduceAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); -PerDeviceOpState *Reduce::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - Reduce *rd = (Reduce *)task->args; - FFHandler handle = *((FFHandler *)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); - ReduceMeta *m = new ReduceMeta(handle, rd, input.domain); - return m; + return {REDUCE_BWD_TASK_ID, binding}; } -void Reduce::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(REDUCE_FWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, false), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, 
FID_DATA);
-  runtime->execute_index_space(ctx, launcher);
+static optional backward_task_impl(TaskArgumentAccessor const &acc) {
+  auto per_device_state =
+      acc.get_argument(PER_DEVICE_STATE);
+  ProfilingSettings profiling = acc.get_argument(PROFILING);
+
+  auto input_grad = acc.get_tensor_grad(INPUT);
+  auto output_grad = acc.get_tensor_grad(OUTPUT);
+
+  return profile(backward_kernel,
+                 profiling,
+                 "[Reduce] backward_time = %.2lfms\n",
+                 per_device_state,
+                 output_grad.get_float_ptr(),
+                 input_grad.get_float_ptr());
 }
-void Reduce::forward_task(Task const *task,
+static void backward_task(Task const *task,
                           std::vector const &regions,
                           Context ctx,
                           Runtime *runtime) {
-  assert(regions.size() == 2);
-  assert(task->regions.size() == 2);
-  ReduceMeta const *m = *((ReduceMeta **)task->local_args);
-  GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
-      DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-
-  forward_kernel_wrapper(m, input, output);
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  backward_task_impl(acc);
 }
-void Reduce::backward(FFModel const &ff) {
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_backward(ff, argmap);
-  IndexLauncher launcher(REDUCE_BWD_TASK_ID,
-                         parallel_is,
-                         TaskArgument(nullptr, 0),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  // regions[0](I): output_grad
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region_grad));
-  launcher.add_field(0, FID_DATA);
-  // regions[1](I/O): input_grad
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad,
-                                                    0 /*projection id*/,
-                                                    READ_WRITE,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region_grad));
-  launcher.add_field(1, FID_DATA);
-  runtime->execute_index_space(ctx, launcher);
-}
+template <>
+void register_task() {
+  OpTaskSignature bwd =
+      infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID));
-void Reduce::backward_task(Task const *task,
-                           std::vector const &regions,
-                           Context ctx,
-                           Runtime *runtime) {
-  assert(regions.size() == 2);
-  assert(task->regions.size() == 2);
-  ReduceMeta const *m = *((ReduceMeta **)task->local_args);
-  GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO(
-      DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
-      DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  backward_kernel_wrapper(m, output_grad, input_grad);
+  register_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task);
 }
-bool Reduce::measure_operator_cost(Simulator *sim,
-                                   MachineView const &mv,
-                                   CostMetrics &cost_metrics) const {
-  ParallelTensorBase sub_input, sub_output;
-  if (!outputs[0]->get_sub_tensor(mv, sub_output)) {
-    return false;
-  }
-  if (!inputs[0]->get_sub_tensor(mv, sub_input)) {
-    return false;
-  }
-  ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain());
-  sim->free_all();
-  float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-  assert(input_ptr != NULL);
-  cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-  GenericTensorAccessorR input_acc(
-      inputs[0]->data_type, sub_input.get_domain(), input_ptr);
-
-  float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-  assert(output_ptr != NULL);
-  cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-  GenericTensorAccessorW output_acc(
-      outputs[0]->data_type, sub_output.get_domain(), output_ptr);
-
-  assert(m->profiling == false);
-
-  std::function forward, backward;
-  forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); };
-  if (sim->computationMode
== COMP_MODE_TRAINING) {
-    float *input_grad_ptr =
-        (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
-    assert(input_grad_ptr != NULL);
-    cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-    GenericTensorAccessorW input_grad_acc(
-        inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr);
-
-    float *output_grad_ptr =
-        (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
-    assert(output_grad_ptr != NULL);
-    cost_metrics.outputs_memory +=
-        cost_metrics.total_mem_diff_from(sim->offset);
-    GenericTensorAccessorR output_grad_acc(
-        outputs[0]->data_type, sub_output.get_domain(), output_grad_ptr);
-
-    backward = [&] {
-      backward_kernel_wrapper(m, output_grad_acc, input_grad_acc);
-    };
-  }
-
-  inner_measure_operator_cost(sim, forward, backward, cost_metrics);
-
-  if (sim->computationMode == COMP_MODE_TRAINING) {
-    printf("[Measure Reduce] name(%s) forward_time(%.4lf) "
-           "backward_time(%.4lf)\n",
-           name,
-           cost_metrics.forward_time,
-           cost_metrics.backward_time);
-  } else {
-    printf("[Measure Reduce] name(%s) forward_time(%.4lf)\n",
-           name,
-           cost_metrics.forward_time);
-  }
-
-  return true;
-}
+CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
+                                  ReduceAttrs const &attrs,
+                                  InputParallelTensorDesc const &input,
+                                  ProfilingSettings const &settings,
+                                  MachineView const &machine_view) {
+  auto env = sim_factory.new_environment();
-void Reduce::serialize(Legion::Serializer &sez) const {
-  ReduceParams params = get_params();
-  sez.serialize(params.op_type);
-  sez.serialize(params.axes.size());
-  for (size_t i = 0; i < params.axes.size(); i++) {
-    sez.serialize(params.axes[i]);
-  }
-  sez.serialize(params.keepdims);
-}
+  SimTaskBinding init_binding;
+  init_binding.bind_arg(ATTRS, attrs);
+  init_binding.bind_arg(HANDLE, ff_handle());
-using PCG::Node;
-Node Reduce::deserialize(FFModel &ff,
-                         Legion::Deserializer &dez,
-                         ParallelTensor inputs[],
-                         int num_inputs) {
-  assert(num_inputs == 1);
-  OperatorType op_type;
-  size_t axes_size;
-  bool keepdims;
-  std::vector axes;
-  dez.deserialize(op_type);
-  dez.deserialize(axes_size);
-  for (size_t i = 0; i < axes_size; i++) {
-    int dim_idx;
-    dez.deserialize(dim_idx);
-    axes.push_back(dim_idx);
-  }
-  dez.deserialize(keepdims);
-  return ff.get_or_create_node(inputs[0], {axes, op_type, keepdims});
-}
+  auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding);
+  DeviceSpecific per_device_state =
+      init_task_impl(init_accessor);
-Op *Reduce::materialize(FFModel &ff,
-                        ParallelTensor inputs[],
-                        int num_inputs) const {
-  ReduceParams params = get_params();
-  return new Reduce(ff, params, inputs[0], this->name);
-}
+  SimTaskBinding fwd_binding;
+  ParallelTensorShape output_shape = get_output_shape(attrs, input.shape);
+  fwd_binding.bind(INPUT, input.shape);
+  fwd_binding.bind(OUTPUT, output_shape);
+  fwd_binding.bind_arg(PROFILING, settings);
+  fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state);
-}; // namespace FlexFlow
+  SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);
+
+  auto fwd_accessor = env.get_fwd_accessor(REDUCE_FWD_TASK_ID, fwd_binding);
+  auto bwd_accessor = env.get_bwd_accessor(REDUCE_BWD_TASK_ID, bwd_binding);
-namespace std {
-size_t hash::operator()(
-    FlexFlow::ReduceParams const &params) const {
-  size_t key = 0;
-  hash_combine(key, params.op_type);
-  hash_combine(key, params.axes.size());
-  for (int n : params.axes) {
-    hash_combine(key, n);
-  }
-  hash_combine(key, params.keepdims);
-  return key;
+  float forward_time = forward_task_impl(fwd_accessor).value();
+  float backward_time = backward_task_impl(bwd_accessor).value();
+
+  float sync_time = default_estimate_sync_time(env);
+  return make_metrics(forward_time, backward_time, sync_time, env);
 }
-}; // namespace std
+
+}; // namespace FlexFlow
diff --git a/lib/runtime/src/ops/reduce.h b/lib/runtime/src/ops/reduce.h
index 52b4eff609..099083ed67 100644
--- a/lib/runtime/src/ops/reduce.h
+++ b/lib/runtime/src/ops/reduce.h
@@ -2,8 +2,8 @@ #define
_FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #include "op-attrs/ops/reduce.h" -#include "op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReduceAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReduceAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view);