From c5e320283a39d30e4cdf4737799eef3aa34fbcf7 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Fri, 8 Sep 2023 05:51:23 +0000 Subject: [PATCH 01/13] add init_kernel for reduction --- deps/fmt | 2 +- lib/kernels/include/kernels/reduce_kernels.h | 29 +++++++++++--------- lib/kernels/src/cuda/reduce_kernels.cu | 9 ++++++ 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/deps/fmt b/deps/fmt index f5e54359df..a33701196a 160000 --- a/deps/fmt +++ b/deps/fmt @@ -1 +1 @@ -Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 +Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50 diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index aeea932f04..99251ee6dd 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -5,25 +5,28 @@ namespace FlexFlow { -class ReducePerDeviceState : public PerDeviceOpState { -public: - ReducePerDeviceState(FFHandler handler, - Reduce const *rd, - Legion::Domain const &input_domain); - ~ReducePerDeviceState(void); -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cudnnTensorDescriptor_t inputTensor, outputTensor; - cudnnReduceTensorDescriptor_t reduceDesc; -#else - miopenTensorDescriptor_t inputTensor, outputTensor; - miopenReduceTensorDescriptor_t reduceDesc; -#endif +struct ReducePerDeviceState { + PerDeviceFFHandle handle; + ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t outputTensor; + ffReduceTensorDescriptor_t reduceDesc; OperatorType op_type; size_t reduction_size; }; +FF_VISITABLE_STRUCT_NO_EQ(ReducePerDeviceState, handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size); + + namespace Kernels { namespace Reduce { + +ReducePerDeviceState init_kernel(PerDeviceFFhandle const &, + ffTensorDescriptor_t const &, + ffTensorDescriptor_t const &, + ffReduceTensorDescriptor_t const &, + OperatorType const &, + size_t const &); + void forward_kernel_wrapper(ReducePerDeviceState const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu index d675f88073..245b0d06e5 100644 --- a/lib/kernels/src/cuda/reduce_kernels.cu +++ b/lib/kernels/src/cuda/reduce_kernels.cu @@ -67,6 +67,15 @@ ReducePerDeviceState::~ReducePerDeviceState(void) { namespace Kernels { namespace Reduce { +ReducePerDeviceState init_kernel(PerDeviceFFhandle const & handle, + ffTensorDescriptor_t const & input_tensor, + ffTensorDescriptor_t const & outputTensor, + ffReduceTensorDescriptor_t const & reduceDesc, + OperatorType const & op_type, + size_t const & reduction_size) { + return {handle, input_tensor, outputTensor, reduceDesc, op_type, reduction_size}; +} + void forward_kernel(cudaStream_t stream, ReducePerDeviceState const *m, float const *input_ptr, From 2ce00f01e773c05f00417c8191167c79748fa642 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Fri, 8 Sep 2023 06:12:24 +0000 Subject: [PATCH 02/13] implement the init --- lib/runtime/src/ops/reduce.cc | 801 ++++++++++++++++++---------------- lib/runtime/src/ops/reduce.h | 4 +- 2 files changed, 434 insertions(+), 371 deletions(-) diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index 8879cad592..5270006bc7 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -1,7 +1,9 @@ #include "reduce.h" #include "kernels/reduce_kernels.h" #include "legion/legion_utilities.h" +#include "utils/exception.decl.h" #include "utils/hash-utils.h" +#include "op-attrs/get_output_shape.h" namespace FlexFlow { // declare Legion names @@ -45,391 +47,452 @@ ReduceParams Reduce::get_params() const { return params; } -Tensor FFModel::reduce_sum(OperatorType op, - const Tensor input, - std::vector const &_axes, - bool keepdims, - char const *name) { - Layer *rd = new Layer(this, - op, - DT_FLOAT, - name, - 1 /*input*/, - 0 /*weights*/, - 1 /*outputs*/, - input); - // Use Legion indexing to store axes - std::vector axes; - for (size_t i = 0; i < _axes.size(); i++) { - axes.push_back(input->num_dims - 1 - _axes[i]); - } - int dims[MAX_TENSOR_DIM]; - int numdim = input->num_dims; - if (keepdims) { - for (int i = 0; i < input->num_dims; i++) { - dims[i] = input->dims[i]; - } - for (size_t i = 0; i < axes.size(); i++) { - dims[axes[i]] = 1; - } - } else { - numdim = 0; - for (int i = 0; i < input->num_dims; i++) { - bool reduced = false; - for (size_t j = 0; j < axes.size(); j++) { - if (axes[j] == i) { - reduced = true; - } - } - if (!reduced) { - dims[numdim++] = input->dims[i]; - } - } - assert(numdim + axes.size() == input->num_dims); - } - rd->outputs[0] = create_tensor_legion_ordering( - numdim, dims, input->data_type, rd, 0, true /*create_grad*/); - rd->add_int_vector_property("legion_axes", axes); - rd->add_int_property("keepdims", keepdims); - layers.push_back(rd); - return rd->outputs[0]; -} +enum Slots {INPUT, OUTPUT, ATTRS, PROFILING, REDUCE, PER_DEVICE_STATE, HANDLE}; -Tensor FFModel::reduce_sum(const Tensor input, - std::vector const &_axes, - bool keepdims, - char const *name) { - return this->reduce(OP_REDUCE_SUM, input, _axes, keepdims, name); -} +OpTaskInvocation init(TransposeAttrs const &attrs) { + OpTaskBinding binding; -Tensor FFModel::reduce_mean(const Tensor input, - std::vector const &_axes, - bool keepdims, - char const *name) { - return this->reduce(OP_REDUCE_MEAN, input, _axes, keepdims, name); -} + binding.bind_arg(HANDLE, ff_handle());. + binding.bind_arg(ATTRS, attrs); -Op *Reduce::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - std::vector axes; - long long value; - layer->get_int_vector_property("legion_axes", axes); - layer->get_int_property("keepdims", value); - bool keepdims = value; - return new Reduce( - model, layer->op_type, inputs[0], axes, keepdims, layer->name); + bind.bind(REDUCE, reduce_descriptor());//Note: how to bind ffReduceTensorDescriptor_t ? + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); } -Reduce::Reduce(FFModel &model, - ReduceParams const ¶ms, - const ParallelTensor input, - char const *name) - : Reduce(model, params.op_type, input, params.axes, params.keepdims, name) { +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { + PerDeviceFFHandle handle = acc.get_argument(HANDLE); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto attrs = acc.get_argument(ATTRS); + auto reduce_desc = acc.get_argument(REDUCE); //Note: this may has some problem, I should ask Colin + OperatorType = attrs.op_type; + //Note: How to set the reduction size? + size_t reduction_size + DeviceSpecific per_device_state = acc.create_device_specific(init_kernel(handle, input, output, reduce_desc, op_type, reduction_size)); + return per_device_state; } -Reduce::Reduce(FFModel &model, - OperatorType _op_type, - const ParallelTensor input, - std::vector const &_axes, - bool _keepdims, - char const *name) - : Op(model, - _op_type, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input), - num_axes(_axes.size()), keepdims(_keepdims) { - for (size_t i = 0; i < num_axes; i++) { - axes[i] = _axes[i]; - } - int num_dims = input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - if (keepdims) { - num_dims = input->num_dims; - for (int i = 0; i < num_dims; i++) { - dims[i] = input->dims[i]; - } - for (int i = 0; i < num_axes; i++) { - // Currently assume that we cannot parallelize along reduced dims - assert(dims[axes[i]].degree == 1); - dims[axes[i]].size = 1; - } - } else { - num_dims = 0; - for (int i = 0; i < input->num_dims; i++) { - bool reduced = false; - for (int j = 0; j < num_axes; j++) { - if (axes[j] == i) { - reduced = true; - } - } - if (!reduced) { - dims[num_dims++] = input->dims[i]; - } else { - // Currently assume that we cannot parallelize along reduced dims - assert(input->dims[i].degree == 1); - assert(input->dims[i].parallel_idx == -1); - } - } - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - num_dims, dims, input->data_type, this); +static DeviceSpecific + init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + return init_task_impl(acc); } -void Reduce::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(REDUCE_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Reduce)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -}; - -PerDeviceOpState *Reduce::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - Reduce *rd = (Reduce *)task->args; - FFHandler handle = *((FFHandler *)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); - ReduceMeta *m = new ReduceMeta(handle, rd, input.domain); - return m; -} +plate <> +void register_task() { + OpTaskSignature init(OpTaskType::INIT) -void Reduce::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(REDUCE_FWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, false), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} + init.add_unchecked_arg_slot(HANDLE); + init.add_arg_slot(ATTRS); -void Reduce::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - ReduceMeta const *m = *((ReduceMeta **)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); - - forward_kernel_wrapper(m, input, output); -} + init.add_input_slot(INPUT); + init.add_output_slot(OUTPUT); + //Note:how to add reduce? -void Reduce::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); - IndexLauncher launcher(REDUCE_BWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - // regions[1](I/O): input_grad - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} + register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); +} -void Reduce::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - ReduceMeta const *m = *((ReduceMeta **)task->local_args); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper(m, output_grad, input_grad); -} - -bool Reduce::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_input, sub_output; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain()); - sim->free_all(); - float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorR input_acc( - inputs[0]->data_type, sub_input.get_domain(), input_ptr); - - float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_ptr != NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW output_acc( - outputs[0]->data_type, sub_output.get_domain(), output_ptr); - - assert(m->profiling == false); - - std::function forward, backward; - forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; - if (sim->computationMode == COMP_MODE_TRAINING) { - float *input_grad_ptr = - (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_grad_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW input_grad_acc( - inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr); - - float *output_grad_ptr = - (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_grad_ptr != NULL); - cost_metrics.outputs_memory += - cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorR output_grad_acc( - outputs[0]->data_type, sub_output.get_domain(), output_grad_ptr); - - backward = [&] { - backward_kernel_wrapper(m, output_grad_acc, input_grad_acc); - }; - } - - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - printf("[Measure Reduce] name(%s) forward_time(%.4lf) " - "backward_time(%.4lf)\n", - name, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Reduce] name(%s) forward_time(%.4lf)\n", - name, - cost_metrics.forward_time); - } - - return true; -} - -void Reduce::serialize(Legion::Serializer &sez) const { - ReduceParams params = get_params(); - sez.serialize(params.op_type); - sez.serialize(params.axes.size()); - for (size_t i = 0; i < params.axes.size(); i++) { - sez.serialize(params.axes[i]); - } - sez.serialize(params.keepdims); -} +OpTaskInvocation forward(ReduceAttrs const & attrs) { + OpTaskBinding binding; -using PCG::Node; -Node Reduce::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 1); - OperatorType op_type; - size_t axes_size; - bool keepdims; - std::vector axes; - dez.deserialize(op_type); - dez.deserialize(axes_size); - for (size_t i = 0; i < axes_size; i++) { - int dim_idx; - dez.deserialize(dim_idx); - axes.push_back(dim_idx); - } - dez.deserialize(keepdims); - return ff.get_or_create_node(inputs[0], {axes, op_type, keepdims}); -} + bind.bind_arg(PER_DEVICE_STATE, per_device_op_state()); + bind.bind_arg(PROFILING, profiling_tensor()); -Op *Reduce::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - ReduceParams params = get_params(); - return new Reduce(ff, params, inputs[0], this->name); + binding.bind(REDUCE, reduce_descriptor());//Note:this may have some problem, how to bind the ffReduceTensorDescriptor_t + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); } -}; // namespace FlexFlow -namespace std { -size_t hash::operator()( - FlexFlow::ReduceParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.op_type); - hash_combine(key, params.axes.size()); - for (int n : params.axes) { - hash_combine(key, n); - } - hash_combine(key, params.keepdims); - return key; -} +// Tensor FFModel::reduce_sum(OperatorType op, +// const Tensor input, +// std::vector const &_axes, +// bool keepdims, +// char const *name) { +// Layer *rd = new Layer(this, +// op, +// DT_FLOAT, +// name, +// 1 /*input*/, +// 0 /*weights*/, +// 1 /*outputs*/, +// input); +// // Use Legion indexing to store axes +// std::vector axes; +// for (size_t i = 0; i < _axes.size(); i++) { +// axes.push_back(input->num_dims - 1 - _axes[i]); +// } +// int dims[MAX_TENSOR_DIM]; +// int numdim = input->num_dims; +// if (keepdims) { +// for (int i = 0; i < input->num_dims; i++) { +// dims[i] = input->dims[i]; +// } +// for (size_t i = 0; i < axes.size(); i++) { +// dims[axes[i]] = 1; +// } +// } else { +// numdim = 0; +// for (int i = 0; i < input->num_dims; i++) { +// bool reduced = false; +// for (size_t j = 0; j < axes.size(); j++) { +// if (axes[j] == i) { +// reduced = true; +// } +// } +// if (!reduced) { +// dims[numdim++] = input->dims[i]; +// } +// } +// assert(numdim + axes.size() == input->num_dims); +// } +// rd->outputs[0] = create_tensor_legion_ordering( +// numdim, dims, input->data_type, rd, 0, true /*create_grad*/); +// rd->add_int_vector_property("legion_axes", axes); +// rd->add_int_property("keepdims", keepdims); +// layers.push_back(rd); +// return rd->outputs[0]; +// } + +// Tensor FFModel::reduce_sum(const Tensor input, +// std::vector const &_axes, +// bool keepdims, +// char const *name) { +// return this->reduce(OP_REDUCE_SUM, input, _axes, keepdims, name); +// } + +// Tensor FFModel::reduce_mean(const Tensor input, +// std::vector const &_axes, +// bool keepdims, +// char const *name) { +// return this->reduce(OP_REDUCE_MEAN, input, _axes, keepdims, name); +// } + +// Op *Reduce::create_operator_from_layer( +// FFModel &model, +// Layer const *layer, +// std::vector const &inputs) { +// std::vector axes; +// long long value; +// layer->get_int_vector_property("legion_axes", axes); +// layer->get_int_property("keepdims", value); +// bool keepdims = value; +// return new Reduce( +// model, layer->op_type, inputs[0], axes, keepdims, layer->name); +// } + +// Reduce::Reduce(FFModel &model, +// ReduceParams const ¶ms, +// const ParallelTensor input, +// char const *name) +// : Reduce(model, params.op_type, input, params.axes, params.keepdims, name) { +// } + +// Reduce::Reduce(FFModel &model, +// OperatorType _op_type, +// const ParallelTensor input, +// std::vector const &_axes, +// bool _keepdims, +// char const *name) +// : Op(model, +// _op_type, +// input->data_type, +// name, +// 1 /*inputs*/, +// 0 /*weights*/, +// 1 /*outputs*/, +// input), +// num_axes(_axes.size()), keepdims(_keepdims) { +// for (size_t i = 0; i < num_axes; i++) { +// axes[i] = _axes[i]; +// } +// int num_dims = input->num_dims; +// ParallelDim dims[MAX_TENSOR_DIM]; +// if (keepdims) { +// num_dims = input->num_dims; +// for (int i = 0; i < num_dims; i++) { +// dims[i] = input->dims[i]; +// } +// for (int i = 0; i < num_axes; i++) { +// // Currently assume that we cannot parallelize along reduced dims +// assert(dims[axes[i]].degree == 1); +// dims[axes[i]].size = 1; +// } +// } else { +// num_dims = 0; +// for (int i = 0; i < input->num_dims; i++) { +// bool reduced = false; +// for (int j = 0; j < num_axes; j++) { +// if (axes[j] == i) { +// reduced = true; +// } +// } +// if (!reduced) { +// dims[num_dims++] = input->dims[i]; +// } else { +// // Currently assume that we cannot parallelize along reduced dims +// assert(input->dims[i].degree == 1); +// assert(input->dims[i].parallel_idx == -1); +// } +// } +// } +// outputs[0] = model.create_parallel_tensor_legion_ordering( +// num_dims, dims, input->data_type, this); +// } + +// void Reduce::init(FFModel const &ff) { +// assert(check_output_input_weight_same_parallel_is()); +// parallel_is = outputs[0]->parallel_is; +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_init(ff, argmap); +// IndexLauncher launcher(REDUCE_INIT_TASK_ID, +// parallel_is, +// TaskArgument(this, sizeof(Reduce)), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// inputs[0]->region)); +// launcher.add_field(0, FID_DATA); +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// outputs[0]->region)); +// launcher.add_field(1, FID_DATA); +// FutureMap fm = runtime->execute_index_space(ctx, launcher); +// fm.wait_all_results(); +// set_opmeta_from_futuremap(ff, fm); +// }; + +// PerDeviceOpState *Reduce::init_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// Reduce *rd = (Reduce *)task->args; +// FFHandler handle = *((FFHandler *)task->local_args); +// GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( +// DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); +// GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( +// DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); +// ReduceMeta *m = new ReduceMeta(handle, rd, input.domain); +// return m; +// } + +// void Reduce::forward(FFModel const &ff) { +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_forward(ff, argmap); +// IndexLauncher launcher(REDUCE_FWD_TASK_ID, +// parallel_is, +// TaskArgument(nullptr, false), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// inputs[0]->region)); +// launcher.add_field(0, FID_DATA); +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// outputs[0]->region)); +// launcher.add_field(1, FID_DATA); +// runtime->execute_index_space(ctx, launcher); +// } + +// void Reduce::forward_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// assert(regions.size() == 2); +// assert(task->regions.size() == 2); +// ReduceMeta const *m = *((ReduceMeta **)task->local_args); +// GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( +// DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); +// GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( +// DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); + +// forward_kernel_wrapper(m, input, output); +// } + +// void Reduce::backward(FFModel const &ff) { +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_backward(ff, argmap); +// IndexLauncher launcher(REDUCE_BWD_TASK_ID, +// parallel_is, +// TaskArgument(nullptr, 0), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// // regions[0](I): output_grad +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// outputs[0]->region_grad)); +// launcher.add_field(0, FID_DATA); +// // regions[1](I/O): input_grad +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, +// 0 /*projection id*/, +// READ_WRITE, +// EXCLUSIVE, +// inputs[0]->region_grad)); +// launcher.add_field(1, FID_DATA); +// runtime->execute_index_space(ctx, launcher); +// } + +// void Reduce::backward_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// assert(regions.size() == 2); +// assert(task->regions.size() == 2); +// ReduceMeta const *m = *((ReduceMeta **)task->local_args); +// GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( +// DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); +// GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( +// DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); +// backward_kernel_wrapper(m, output_grad, input_grad); +// } + +// bool Reduce::measure_operator_cost(Simulator *sim, +// MachineView const &mv, +// CostMetrics &cost_metrics) const { +// ParallelTensorBase sub_input, sub_output; +// if (!outputs[0]->get_sub_tensor(mv, sub_output)) { +// return false; +// } +// if (!inputs[0]->get_sub_tensor(mv, sub_input)) { +// return false; +// } +// ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain()); +// sim->free_all(); +// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); +// assert(input_ptr != NULL); +// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// GenericTensorAccessorR input_acc( +// inputs[0]->data_type, sub_input.get_domain(), input_ptr); + +// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); +// assert(output_ptr != NULL); +// cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// GenericTensorAccessorW output_acc( +// outputs[0]->data_type, sub_output.get_domain(), output_ptr); + +// assert(m->profiling == false); + +// std::function forward, backward; +// forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; +// if (sim->computationMode == COMP_MODE_TRAINING) { +// float *input_grad_ptr = +// (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); +// assert(input_grad_ptr != NULL); +// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// GenericTensorAccessorW input_grad_acc( +// inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr); + +// float *output_grad_ptr = +// (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); +// assert(output_grad_ptr != NULL); +// cost_metrics.outputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); +// GenericTensorAccessorR output_grad_acc( +// outputs[0]->data_type, sub_output.get_domain(), output_grad_ptr); + +// backward = [&] { +// backward_kernel_wrapper(m, output_grad_acc, input_grad_acc); +// }; +// } + +// inner_measure_operator_cost(sim, forward, backward, cost_metrics); + +// if (sim->computationMode == COMP_MODE_TRAINING) { +// printf("[Measure Reduce] name(%s) forward_time(%.4lf) " +// "backward_time(%.4lf)\n", +// name, +// cost_metrics.forward_time, +// cost_metrics.backward_time); +// } else { +// printf("[Measure Reduce] name(%s) forward_time(%.4lf)\n", +// name, +// cost_metrics.forward_time); +// } + +// return true; +// } + +// void Reduce::serialize(Legion::Serializer &sez) const { +// ReduceParams params = get_params(); +// sez.serialize(params.op_type); +// sez.serialize(params.axes.size()); +// for (size_t i = 0; i < params.axes.size(); i++) { +// sez.serialize(params.axes[i]); +// } +// sez.serialize(params.keepdims); +// } + +// using PCG::Node; +// Node Reduce::deserialize(FFModel &ff, +// Legion::Deserializer &dez, +// ParallelTensor inputs[], +// int num_inputs) { +// assert(num_inputs == 1); +// OperatorType op_type; +// size_t axes_size; +// bool keepdims; +// std::vector axes; +// dez.deserialize(op_type); +// dez.deserialize(axes_size); +// for (size_t i = 0; i < axes_size; i++) { +// int dim_idx; +// dez.deserialize(dim_idx); +// axes.push_back(dim_idx); +// } +// dez.deserialize(keepdims); +// return ff.get_or_create_node(inputs[0], {axes, op_type, keepdims}); +// } + +// Op *Reduce::materialize(FFModel &ff, +// ParallelTensor inputs[], +// int num_inputs) const { +// ReduceParams params = get_params(); +// return new Reduce(ff, params, inputs[0], this->name); +// } + +// }; // namespace FlexFlow + +// namespace std { +// size_t hash::operator()( +// FlexFlow::ReduceParams const ¶ms) const { +// size_t key = 0; +// hash_combine(key, params.op_type); +// hash_combine(key, params.axes.size()); +// for (int n : params.axes) { +// hash_combine(key, n); +// } +// hash_combine(key, params.keepdims); +// return key; +// } }; // namespace std diff --git a/lib/runtime/src/ops/reduce.h b/lib/runtime/src/ops/reduce.h index 52b4eff609..fd3df85281 100644 --- a/lib/runtime/src/ops/reduce.h +++ b/lib/runtime/src/ops/reduce.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #include "op-attrs/ops/reduce.h" -#include "op_task_invocation.h" +#include "task_spec/op_task_invocation.h" #include "sim_environment.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReduceAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReduceAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); From 356cdde52eb352659d94a756f5b211c5f0e33103 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Fri, 8 Sep 2023 06:18:52 +0000 Subject: [PATCH 03/13] add forward API --- lib/runtime/src/ops/reduce.cc | 46 ++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index 5270006bc7..1bed6ee959 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -55,19 +55,17 @@ OpTaskInvocation init(TransposeAttrs const &attrs) { binding.bind_arg(HANDLE, ff_handle());. binding.bind_arg(ATTRS, attrs); - bind.bind(REDUCE, reduce_descriptor());//Note: how to bind ffReduceTensorDescriptor_t ? - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); + return {REDUCE_INIT_TASK_ID, binding}; } static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - auto reduce_desc = acc.get_argument(REDUCE); //Note: this may has some problem, I should ask Colin OperatorType = attrs.op_type; //Note: How to set the reduction size? + ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t outputTensor; + ffReduceTensorDescriptor_t reduceDesc; size_t reduction_size DeviceSpecific per_device_state = acc.create_device_specific(init_kernel(handle, input, output, reduce_desc, op_type, reduction_size)); return per_device_state; @@ -87,24 +85,48 @@ void register_task() { OpTaskSignature init(OpTaskType::INIT) init.add_unchecked_arg_slot(HANDLE); - init.add_arg_slot(ATTRS); - - init.add_input_slot(INPUT); - init.add_output_slot(OUTPUT); - //Note:how to add reduce? + init.add_arg_slot(ATTRS); register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); } +//Note: forward_kernel only needs ReducePerDeviceState, input, output OpTaskInvocation forward(ReduceAttrs const & attrs) { OpTaskBinding binding; bind.bind_arg(PER_DEVICE_STATE, per_device_op_state()); bind.bind_arg(PROFILING, profiling_tensor()); - binding.bind(REDUCE, reduce_descriptor());//Note:this may have some problem, how to bind the ffReduceTensorDescriptor_t binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); + + return {REDUCE_FWD_TASK_ID, binding}; +} + + +static optional forward_task_impl(TaskArgumentAccessor const &acc) { + NOT_IMPLEMENTED(); +} + +static void forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); +} + +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FORWARD); + + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_arg_slot(PROFILING); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + + register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task); } From c45a318995d64f8ec7de1f481715ce9d5a679a22 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Fri, 8 Sep 2023 06:21:57 +0000 Subject: [PATCH 04/13] add backward API --- lib/runtime/src/ops/reduce.cc | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index 1bed6ee959..f5801fa584 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -129,6 +129,30 @@ void register_task() { register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task); } +OpTaskInvocation backward(ReduceAttrs const & attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); + + return {REDUCE_BWD_TASK_ID, binding}; +} + +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + NOT_IMPLEMENTED(); +} + +static void backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); +} + +template <> +void register_task() { + OpTaskSignature bwd = infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); + + reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task); +} // Tensor FFModel::reduce_sum(OperatorType op, // const Tensor input, From 074f1fe50adf74acc1014d8bfeef042d20c3dd20 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Fri, 8 Sep 2023 06:33:53 +0000 Subject: [PATCH 05/13] implement measure_operator_cost and Reduce version 0.1 --- lib/runtime/src/ops/reduce.cc | 60 +++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index f5801fa584..ea56c86268 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -4,6 +4,7 @@ #include "utils/exception.decl.h" #include "utils/hash-utils.h" #include "op-attrs/get_output_shape.h" +#include "utils/type_traits_core.h" namespace FlexFlow { // declare Legion names @@ -105,7 +106,18 @@ OpTaskInvocation forward(ReduceAttrs const & attrs) { static optional forward_task_impl(TaskArgumentAccessor const &acc) { - NOT_IMPLEMENTED(); + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + ProfilingSettings profiling = acc.get_argument(PROFILING); + + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + + return profile(forward_kernel, + profiling, + "[Reduce] forward_time = %.2lfms\n", + &per_device_state, + input.get_float_ptr(), + output.get_float_ptr()); } static void forward_task(Task const *task, @@ -136,7 +148,18 @@ OpTaskInvocation backward(ReduceAttrs const & attrs) { } static optional backward_task_impl(TaskArgumentAccessor const &acc) { - NOT_IMPLEMENTED(); + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + ProfilingSettings profiling = acc.get_argument(PROFILING); + + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + + return profile(backward_kernel, + profiling, + "[Reduce] backward_time = %.2lfms\n", + &per_device_state, + input.get_float_ptr(), + output.get_float_ptr()); } static void backward_task(Task const *task, @@ -154,6 +177,39 @@ void register_task() { reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task); } +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + ReduceAttrs const &attrs, + InputParallelTensorDesc const &input, + ProfilingSettings const &settings, + MachineView const &machine_view) { + auto env = sim.new_environment(); + + SimTaskBinding init_binding; + init_binding.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + + auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding); + DeviceSpecific per_device_state = init_task_impl(init_accessor); + + SimTaskBinding fwd_binding; + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + fwd.bind(INPUT, input); + fwd.bind(OUTPUT, output_shape); + fwd.bind_arg(PROFILING, settings); + fwd.bind_arg(PER_DEVICE_STATE, per_device_state); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(REDUCE_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(REDUCE_BWD_TASK_ID, bwd_binding); + + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); + } + // Tensor FFModel::reduce_sum(OperatorType op, // const Tensor input, // std::vector const &_axes, From db204dd75e2bf1dc6d1e74083bafa134f56f36a4 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Fri, 8 Sep 2023 06:34:10 +0000 Subject: [PATCH 06/13] format the code --- lib/kernels/include/kernels/reduce_kernels.h | 19 ++- lib/kernels/src/cuda/reduce_kernels.cu | 15 ++- lib/runtime/src/ops/reduce.cc | 134 ++++++++++--------- lib/runtime/src/ops/reduce.h | 2 +- 4 files changed, 95 insertions(+), 75 deletions(-) diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 99251ee6dd..7d7f2fc7f0 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -14,18 +14,23 @@ struct ReducePerDeviceState { size_t reduction_size; }; -FF_VISITABLE_STRUCT_NO_EQ(ReducePerDeviceState, handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size); - +FF_VISITABLE_STRUCT_NO_EQ(ReducePerDeviceState, + handle, + inputTensor, + outputTensor, + reduceDesc, + op_type, + reduction_size); namespace Kernels { namespace Reduce { ReducePerDeviceState init_kernel(PerDeviceFFhandle const &, - ffTensorDescriptor_t const &, - ffTensorDescriptor_t const &, - ffReduceTensorDescriptor_t const &, - OperatorType const &, - size_t const &); + ffTensorDescriptor_t const &, + ffTensorDescriptor_t const &, + ffReduceTensorDescriptor_t const &, + OperatorType const &, + size_t const &); void forward_kernel_wrapper(ReducePerDeviceState const *m, GenericTensorAccessorR const &input, diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu index 245b0d06e5..87e3be991c 100644 --- a/lib/kernels/src/cuda/reduce_kernels.cu +++ b/lib/kernels/src/cuda/reduce_kernels.cu @@ -67,13 +67,14 @@ ReducePerDeviceState::~ReducePerDeviceState(void) { namespace Kernels { namespace Reduce { -ReducePerDeviceState init_kernel(PerDeviceFFhandle const & handle, - ffTensorDescriptor_t const & input_tensor, - ffTensorDescriptor_t const & outputTensor, - ffReduceTensorDescriptor_t const & reduceDesc, - OperatorType const & op_type, - size_t const & reduction_size) { - return {handle, input_tensor, outputTensor, reduceDesc, op_type, reduction_size}; +ReducePerDeviceState init_kernel(PerDeviceFFhandle const &handle, + ffTensorDescriptor_t const &input_tensor, + ffTensorDescriptor_t const &outputTensor, + ffReduceTensorDescriptor_t const &reduceDesc, + OperatorType const &op_type, + size_t const &reduction_size) { + return { + handle, input_tensor, outputTensor, reduceDesc, op_type, reduction_size}; } void forward_kernel(cudaStream_t stream, diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index ea56c86268..c06a75c7da 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -1,9 +1,9 @@ #include "reduce.h" #include "kernels/reduce_kernels.h" #include "legion/legion_utilities.h" +#include "op-attrs/get_output_shape.h" #include "utils/exception.decl.h" #include "utils/hash-utils.h" -#include "op-attrs/get_output_shape.h" #include "utils/type_traits_core.h" namespace FlexFlow { @@ -48,27 +48,37 @@ ReduceParams Reduce::get_params() const { return params; } -enum Slots {INPUT, OUTPUT, ATTRS, PROFILING, REDUCE, PER_DEVICE_STATE, HANDLE}; +enum Slots { + INPUT, + OUTPUT, + ATTRS, + PROFILING, + REDUCE, + PER_DEVICE_STATE, + HANDLE +}; OpTaskInvocation init(TransposeAttrs const &attrs) { OpTaskBinding binding; - binding.bind_arg(HANDLE, ff_handle());. - binding.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + .binding.bind_arg(ATTRS, attrs); return {REDUCE_INIT_TASK_ID, binding}; } -static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { PerDeviceFFHandle handle = acc.get_argument(HANDLE); auto attrs = acc.get_argument(ATTRS); OperatorType = attrs.op_type; - //Note: How to set the reduction size? + // Note: How to set the reduction size? ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; ffReduceTensorDescriptor_t reduceDesc; - size_t reduction_size - DeviceSpecific per_device_state = acc.create_device_specific(init_kernel(handle, input, output, reduce_desc, op_type, reduction_size)); + size_t reduction_size DeviceSpecific per_device_state = + acc.create_device_specific(init_kernel( + handle, input, output, reduce_desc, op_type, reduction_size)); return per_device_state; } @@ -81,18 +91,17 @@ static DeviceSpecific return init_task_impl(acc); } -plate <> -void register_task() { - OpTaskSignature init(OpTaskType::INIT) +plate<> void register_task() { + OpTaskSignature init(OpTaskType::INIT) - init.add_unchecked_arg_slot(HANDLE); - init.add_arg_slot(ATTRS); + init.add_unchecked_arg_slot(HANDLE); + init.add_arg_slot(ATTRS); - register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); -} + register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); +} -//Note: forward_kernel only needs ReducePerDeviceState, input, output -OpTaskInvocation forward(ReduceAttrs const & attrs) { +// Note: forward_kernel only needs ReducePerDeviceState, input, output +OpTaskInvocation forward(ReduceAttrs const &attrs) { OpTaskBinding binding; bind.bind_arg(PER_DEVICE_STATE, per_device_op_state()); @@ -104,9 +113,9 @@ OpTaskInvocation forward(ReduceAttrs const & attrs) { return {REDUCE_FWD_TASK_ID, binding}; } - static optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -117,7 +126,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { "[Reduce] forward_time = %.2lfms\n", &per_device_state, input.get_float_ptr(), - output.get_float_ptr()); + output.get_float_ptr()); } static void forward_task(Task const *task, @@ -141,14 +150,15 @@ void register_task() { register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task); } -OpTaskInvocation backward(ReduceAttrs const & attrs) { +OpTaskInvocation backward(ReduceAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); return {REDUCE_BWD_TASK_ID, binding}; } static optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); @@ -159,7 +169,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { "[Reduce] backward_time = %.2lfms\n", &per_device_state, input.get_float_ptr(), - output.get_float_ptr()); + output.get_float_ptr()); } static void backward_task(Task const *task, @@ -172,9 +182,10 @@ static void backward_task(Task const *task, template <> void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); + OpTaskSignature bwd = + infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); - reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task); + reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -182,33 +193,34 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim.new_environment(); - SimTaskBinding init_binding; - init_binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); + SimTaskBinding init_binding; + init_binding.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); - auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding); - DeviceSpecific per_device_state = init_task_impl(init_accessor); + auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding); + DeviceSpecific per_device_state = + init_task_impl(init_accessor); - SimTaskBinding fwd_binding; - ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - fwd.bind(INPUT, input); - fwd.bind(OUTPUT, output_shape); - fwd.bind_arg(PROFILING, settings); - fwd.bind_arg(PER_DEVICE_STATE, per_device_state); + SimTaskBinding fwd_binding; + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + fwd.bind(INPUT, input); + fwd.bind(OUTPUT, output_shape); + fwd.bind_arg(PROFILING, settings); + fwd.bind_arg(PER_DEVICE_STATE, per_device_state); - SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_fwd_accessor(REDUCE_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = env.get_bwd_accessor(REDUCE_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = env.get_fwd_accessor(REDUCE_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(REDUCE_BWD_TASK_ID, bwd_binding); - float forward_time = forward_task_impl(fwd_accessor).value(); - float backward_time = backward_task_impl(bwd_accessor).value(); + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); - float sync_time = default_estimate_sync_time(env); - return make_metrics(forward_time, backward_time, sync_time, env); - } + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); +} // Tensor FFModel::reduce_sum(OperatorType op, // const Tensor input, @@ -291,7 +303,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, // ReduceParams const ¶ms, // const ParallelTensor input, // char const *name) -// : Reduce(model, params.op_type, input, params.axes, params.keepdims, name) { +// : Reduce(model, params.op_type, input, params.axes, params.keepdims, +// name) { // } // Reduce::Reduce(FFModel &model, @@ -379,9 +392,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, // }; // PerDeviceOpState *Reduce::init_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { +// std::vector const +// ®ions, Context ctx, Runtime *runtime) +// { // Reduce *rd = (Reduce *)task->args; // FFHandler handle = *((FFHandler *)task->local_args); // GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( @@ -491,16 +504,16 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, // } // ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain()); // sim->free_all(); -// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); -// assert(input_ptr != NULL); -// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); -// GenericTensorAccessorR input_acc( +// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), +// DT_FLOAT); assert(input_ptr != NULL); cost_metrics.inputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorR +// input_acc( // inputs[0]->data_type, sub_input.get_domain(), input_ptr); -// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); -// assert(output_ptr != NULL); -// cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); -// GenericTensorAccessorW output_acc( +// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), +// DT_FLOAT); assert(output_ptr != NULL); cost_metrics.outputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorW +// output_acc( // outputs[0]->data_type, sub_output.get_domain(), output_ptr); // assert(m->profiling == false); @@ -511,8 +524,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, // float *input_grad_ptr = // (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); // assert(input_grad_ptr != NULL); -// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); -// GenericTensorAccessorW input_grad_acc( +// cost_metrics.inputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorW +// input_grad_acc( // inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr); // float *output_grad_ptr = @@ -597,4 +611,4 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, // hash_combine(key, params.keepdims); // return key; // } -}; // namespace std +}; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reduce.h b/lib/runtime/src/ops/reduce.h index fd3df85281..099083ed67 100644 --- a/lib/runtime/src/ops/reduce.h +++ b/lib/runtime/src/ops/reduce.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #include "op-attrs/ops/reduce.h" -#include "task_spec/op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { From bf95bc45170edc11c33ca8ffa413511316c62395 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 21:33:40 +0000 Subject: [PATCH 07/13] update the redudce --- lib/kernels/include/kernels/reduce_kernels.h | 13 +++-- lib/kernels/src/cuda/reduce_kernels.cu | 53 ++++++++++++-------- lib/runtime/src/ops/reduce.cc | 24 +++++---- 3 files changed, 52 insertions(+), 38 deletions(-) diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 7d7f2fc7f0..9db7bd9aea 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -14,7 +14,7 @@ struct ReducePerDeviceState { size_t reduction_size; }; -FF_VISITABLE_STRUCT_NO_EQ(ReducePerDeviceState, +FF_VISITABLE_STRUCT(ReducePerDeviceState, handle, inputTensor, outputTensor, @@ -26,17 +26,16 @@ namespace Kernels { namespace Reduce { ReducePerDeviceState init_kernel(PerDeviceFFhandle const &, - ffTensorDescriptor_t const &, - ffTensorDescriptor_t const &, - ffReduceTensorDescriptor_t const &, OperatorType const &, - size_t const &); + size_t const &, + ArrayShape input_shape, + ArrayShape output_shape); -void forward_kernel_wrapper(ReducePerDeviceState const *m, +void forward_kernel_wrapper(ReducePerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void backward_kernel_wrapper(ReducePerDeviceState const *m, +void backward_kernel_wrapper(ReducePerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu index 87e3be991c..b7a5c824f2 100644 --- a/lib/kernels/src/cuda/reduce_kernels.cu +++ b/lib/kernels/src/cuda/reduce_kernels.cu @@ -68,59 +68,70 @@ namespace Kernels { namespace Reduce { ReducePerDeviceState init_kernel(PerDeviceFFhandle const &handle, - ffTensorDescriptor_t const &input_tensor, - ffTensorDescriptor_t const &outputTensor, - ffReduceTensorDescriptor_t const &reduceDesc, OperatorType const &op_type, - size_t const &reduction_size) { - return { - handle, input_tensor, outputTensor, reduceDesc, op_type, reduction_size}; + size_t const &reduction_size, + ArrayShape const & input_shape, + ArrayShape const & output_shape) { + + ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t outputTensor; + ffReduceTensorDescriptor_t reduceDesc; + + checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); + checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));; + checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); + + checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape)); + checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + + ReducePerDeviceState per_device = {handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size}; + } void forward_kernel(cudaStream_t stream, - ReducePerDeviceState const *m, + ReducePerDeviceState const &m, float const *input_ptr, float *output_ptr) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; - checkCUDNN(cudnnReduceTensor(m->handle.dnn, - m->reduceDesc, + checkCUDNN(cudnnReduceTensor(m.handle.dnn, + m.reduceDesc, nullptr /*indices*/, 0 /*indicesSizeInBytes*/, - m->handle.workSpace, - m->handle.workSpaceSize, + m.handle.workSpace, + m.handle.workSpaceSize, &alpha, - m->inputTensor, + m.inputTensor, input_ptr, &beta, - m->outputTensor, + m.outputTensor, output_ptr)); }; void backward_kernel(cudaStream_t stream, - ReducePerDeviceState const *m, + ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0, beta = 1.0f; - switch (m->op_type) { + switch (m.op_type) { case OP_REDUCE_SUM: alpha = 1.0f; break; case OP_REDUCE_MEAN: // When the output is the average of multiple input elements // we need to scale the gradients by 1.0 / reduction_size - alpha = 1.0f / m->reduction_size; + alpha = 1.0f / m.reduction_size; break; default: assert(false); } - checkCUDNN(cudnnAddTensor(m->handle.dnn, + checkCUDNN(cudnnAddTensor(m.handle.dnn, &alpha, - m->outputTensor, + m.outputTensor, output_grad_ptr, &beta, - m->inputTensor, + m.inputTensor, input_grad_ptr)); } diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index c06a75c7da..59ce5f9973 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -62,7 +62,10 @@ OpTaskInvocation init(TransposeAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(HANDLE, ff_handle()); - .binding.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); + + binding.bind(INPUT, input_parallel_tensor_shape(0)); + binding.bind(OUTPUT, output_parallel_tensor_shape(0)); return {REDUCE_INIT_TASK_ID, binding}; } @@ -71,14 +74,15 @@ static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { PerDeviceFFHandle handle = acc.get_argument(HANDLE); auto attrs = acc.get_argument(ATTRS); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + OperatorType = attrs.op_type; // Note: How to set the reduction size? - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t outputTensor; - ffReduceTensorDescriptor_t reduceDesc; - size_t reduction_size DeviceSpecific per_device_state = + size_t reduction_size ; + DeviceSpecific per_device_state = acc.create_device_specific(init_kernel( - handle, input, output, reduce_desc, op_type, reduction_size)); + handle, op_type, reduction_size, input.shape, output.shape)); return per_device_state; } @@ -107,7 +111,7 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) { bind.bind_arg(PER_DEVICE_STATE, per_device_op_state()); bind.bind_arg(PROFILING, profiling_tensor()); - binding.bind(INPUT, input_tensor(0)); + binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); return {REDUCE_FWD_TASK_ID, binding}; @@ -124,7 +128,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, "[Reduce] forward_time = %.2lfms\n", - &per_device_state, + per_device_state, input.get_float_ptr(), output.get_float_ptr()); } @@ -142,7 +146,7 @@ void register_task() { OpTaskSignature fwd(OpTaskType::FORWARD); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); @@ -167,7 +171,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, "[Reduce] backward_time = %.2lfms\n", - &per_device_state, + per_device_state, input.get_float_ptr(), output.get_float_ptr()); } From be032a196ee851e14a90d0811a4c9b9e39fd4be2 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 21:34:27 +0000 Subject: [PATCH 08/13] update the redudce --- lib/runtime/src/ops/reduce.cc | 389 ---------------------------------- 1 file changed, 389 deletions(-) diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index 59ce5f9973..bfcf64c186 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -226,393 +226,4 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, return make_metrics(forward_time, backward_time, sync_time, env); } -// Tensor FFModel::reduce_sum(OperatorType op, -// const Tensor input, -// std::vector const &_axes, -// bool keepdims, -// char const *name) { -// Layer *rd = new Layer(this, -// op, -// DT_FLOAT, -// name, -// 1 /*input*/, -// 0 /*weights*/, -// 1 /*outputs*/, -// input); -// // Use Legion indexing to store axes -// std::vector axes; -// for (size_t i = 0; i < _axes.size(); i++) { -// axes.push_back(input->num_dims - 1 - _axes[i]); -// } -// int dims[MAX_TENSOR_DIM]; -// int numdim = input->num_dims; -// if (keepdims) { -// for (int i = 0; i < input->num_dims; i++) { -// dims[i] = input->dims[i]; -// } -// for (size_t i = 0; i < axes.size(); i++) { -// dims[axes[i]] = 1; -// } -// } else { -// numdim = 0; -// for (int i = 0; i < input->num_dims; i++) { -// bool reduced = false; -// for (size_t j = 0; j < axes.size(); j++) { -// if (axes[j] == i) { -// reduced = true; -// } -// } -// if (!reduced) { -// dims[numdim++] = input->dims[i]; -// } -// } -// assert(numdim + axes.size() == input->num_dims); -// } -// rd->outputs[0] = create_tensor_legion_ordering( -// numdim, dims, input->data_type, rd, 0, true /*create_grad*/); -// rd->add_int_vector_property("legion_axes", axes); -// rd->add_int_property("keepdims", keepdims); -// layers.push_back(rd); -// return rd->outputs[0]; -// } - -// Tensor FFModel::reduce_sum(const Tensor input, -// std::vector const &_axes, -// bool keepdims, -// char const *name) { -// return this->reduce(OP_REDUCE_SUM, input, _axes, keepdims, name); -// } - -// Tensor FFModel::reduce_mean(const Tensor input, -// std::vector const &_axes, -// bool keepdims, -// char const *name) { -// return this->reduce(OP_REDUCE_MEAN, input, _axes, keepdims, name); -// } - -// Op *Reduce::create_operator_from_layer( -// FFModel &model, -// Layer const *layer, -// std::vector const &inputs) { -// std::vector axes; -// long long value; -// layer->get_int_vector_property("legion_axes", axes); -// layer->get_int_property("keepdims", value); -// bool keepdims = value; -// return new Reduce( -// model, layer->op_type, inputs[0], axes, keepdims, layer->name); -// } - -// Reduce::Reduce(FFModel &model, -// ReduceParams const ¶ms, -// const ParallelTensor input, -// char const *name) -// : Reduce(model, params.op_type, input, params.axes, params.keepdims, -// name) { -// } - -// Reduce::Reduce(FFModel &model, -// OperatorType _op_type, -// const ParallelTensor input, -// std::vector const &_axes, -// bool _keepdims, -// char const *name) -// : Op(model, -// _op_type, -// input->data_type, -// name, -// 1 /*inputs*/, -// 0 /*weights*/, -// 1 /*outputs*/, -// input), -// num_axes(_axes.size()), keepdims(_keepdims) { -// for (size_t i = 0; i < num_axes; i++) { -// axes[i] = _axes[i]; -// } -// int num_dims = input->num_dims; -// ParallelDim dims[MAX_TENSOR_DIM]; -// if (keepdims) { -// num_dims = input->num_dims; -// for (int i = 0; i < num_dims; i++) { -// dims[i] = input->dims[i]; -// } -// for (int i = 0; i < num_axes; i++) { -// // Currently assume that we cannot parallelize along reduced dims -// assert(dims[axes[i]].degree == 1); -// dims[axes[i]].size = 1; -// } -// } else { -// num_dims = 0; -// for (int i = 0; i < input->num_dims; i++) { -// bool reduced = false; -// for (int j = 0; j < num_axes; j++) { -// if (axes[j] == i) { -// reduced = true; -// } -// } -// if (!reduced) { -// dims[num_dims++] = input->dims[i]; -// } else { -// // Currently assume that we cannot parallelize along reduced dims -// assert(input->dims[i].degree == 1); -// assert(input->dims[i].parallel_idx == -1); -// } -// } -// } -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// num_dims, dims, input->data_type, this); -// } - -// void Reduce::init(FFModel const &ff) { -// assert(check_output_input_weight_same_parallel_is()); -// parallel_is = outputs[0]->parallel_is; -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_init(ff, argmap); -// IndexLauncher launcher(REDUCE_INIT_TASK_ID, -// parallel_is, -// TaskArgument(this, sizeof(Reduce)), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(1, FID_DATA); -// FutureMap fm = runtime->execute_index_space(ctx, launcher); -// fm.wait_all_results(); -// set_opmeta_from_futuremap(ff, fm); -// }; - -// PerDeviceOpState *Reduce::init_task(Task const *task, -// std::vector const -// ®ions, Context ctx, Runtime *runtime) -// { -// Reduce *rd = (Reduce *)task->args; -// FFHandler handle = *((FFHandler *)task->local_args); -// GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( -// DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); -// GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( -// DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); -// ReduceMeta *m = new ReduceMeta(handle, rd, input.domain); -// return m; -// } - -// void Reduce::forward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_forward(ff, argmap); -// IndexLauncher launcher(REDUCE_FWD_TASK_ID, -// parallel_is, -// TaskArgument(nullptr, false), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(1, FID_DATA); -// runtime->execute_index_space(ctx, launcher); -// } - -// void Reduce::forward_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { -// assert(regions.size() == 2); -// assert(task->regions.size() == 2); -// ReduceMeta const *m = *((ReduceMeta **)task->local_args); -// GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( -// DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); -// GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( -// DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); - -// forward_kernel_wrapper(m, input, output); -// } - -// void Reduce::backward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_backward(ff, argmap); -// IndexLauncher launcher(REDUCE_BWD_TASK_ID, -// parallel_is, -// TaskArgument(nullptr, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// // regions[0](I): output_grad -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// outputs[0]->region_grad)); -// launcher.add_field(0, FID_DATA); -// // regions[1](I/O): input_grad -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[0]->region_grad)); -// launcher.add_field(1, FID_DATA); -// runtime->execute_index_space(ctx, launcher); -// } - -// void Reduce::backward_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { -// assert(regions.size() == 2); -// assert(task->regions.size() == 2); -// ReduceMeta const *m = *((ReduceMeta **)task->local_args); -// GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( -// DT_FLOAT, regions[0], task->regions[0], FID_DATA, ctx, runtime); -// GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( -// DT_FLOAT, regions[1], task->regions[1], FID_DATA, ctx, runtime); -// backward_kernel_wrapper(m, output_grad, input_grad); -// } - -// bool Reduce::measure_operator_cost(Simulator *sim, -// MachineView const &mv, -// CostMetrics &cost_metrics) const { -// ParallelTensorBase sub_input, sub_output; -// if (!outputs[0]->get_sub_tensor(mv, sub_output)) { -// return false; -// } -// if (!inputs[0]->get_sub_tensor(mv, sub_input)) { -// return false; -// } -// ReduceMeta *m = new ReduceMeta(sim->handler, this, sub_input.get_domain()); -// sim->free_all(); -// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), -// DT_FLOAT); assert(input_ptr != NULL); cost_metrics.inputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorR -// input_acc( -// inputs[0]->data_type, sub_input.get_domain(), input_ptr); - -// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), -// DT_FLOAT); assert(output_ptr != NULL); cost_metrics.outputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorW -// output_acc( -// outputs[0]->data_type, sub_output.get_domain(), output_ptr); - -// assert(m->profiling == false); - -// std::function forward, backward; -// forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; -// if (sim->computationMode == COMP_MODE_TRAINING) { -// float *input_grad_ptr = -// (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); -// assert(input_grad_ptr != NULL); -// cost_metrics.inputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); GenericTensorAccessorW -// input_grad_acc( -// inputs[0]->data_type, sub_input.get_domain(), input_grad_ptr); - -// float *output_grad_ptr = -// (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); -// assert(output_grad_ptr != NULL); -// cost_metrics.outputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); -// GenericTensorAccessorR output_grad_acc( -// outputs[0]->data_type, sub_output.get_domain(), output_grad_ptr); - -// backward = [&] { -// backward_kernel_wrapper(m, output_grad_acc, input_grad_acc); -// }; -// } - -// inner_measure_operator_cost(sim, forward, backward, cost_metrics); - -// if (sim->computationMode == COMP_MODE_TRAINING) { -// printf("[Measure Reduce] name(%s) forward_time(%.4lf) " -// "backward_time(%.4lf)\n", -// name, -// cost_metrics.forward_time, -// cost_metrics.backward_time); -// } else { -// printf("[Measure Reduce] name(%s) forward_time(%.4lf)\n", -// name, -// cost_metrics.forward_time); -// } - -// return true; -// } - -// void Reduce::serialize(Legion::Serializer &sez) const { -// ReduceParams params = get_params(); -// sez.serialize(params.op_type); -// sez.serialize(params.axes.size()); -// for (size_t i = 0; i < params.axes.size(); i++) { -// sez.serialize(params.axes[i]); -// } -// sez.serialize(params.keepdims); -// } - -// using PCG::Node; -// Node Reduce::deserialize(FFModel &ff, -// Legion::Deserializer &dez, -// ParallelTensor inputs[], -// int num_inputs) { -// assert(num_inputs == 1); -// OperatorType op_type; -// size_t axes_size; -// bool keepdims; -// std::vector axes; -// dez.deserialize(op_type); -// dez.deserialize(axes_size); -// for (size_t i = 0; i < axes_size; i++) { -// int dim_idx; -// dez.deserialize(dim_idx); -// axes.push_back(dim_idx); -// } -// dez.deserialize(keepdims); -// return ff.get_or_create_node(inputs[0], {axes, op_type, keepdims}); -// } - -// Op *Reduce::materialize(FFModel &ff, -// ParallelTensor inputs[], -// int num_inputs) const { -// ReduceParams params = get_params(); -// return new Reduce(ff, params, inputs[0], this->name); -// } - -// }; // namespace FlexFlow - -// namespace std { -// size_t hash::operator()( -// FlexFlow::ReduceParams const ¶ms) const { -// size_t key = 0; -// hash_combine(key, params.op_type); -// hash_combine(key, params.axes.size()); -// for (int n : params.axes) { -// hash_combine(key, n); -// } -// hash_combine(key, params.keepdims); -// return key; -// } }; // namespace FlexFlow From bb542fdcb36e8cc41781c4248b103d97005188af Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 27 Sep 2023 20:38:34 +0000 Subject: [PATCH 09/13] use exceptions in reduces.cc --- lib/runtime/src/ops/reduce.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index bfcf64c186..fdf381910e 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -2,7 +2,7 @@ #include "kernels/reduce_kernels.h" #include "legion/legion_utilities.h" #include "op-attrs/get_output_shape.h" -#include "utils/exception.decl.h" +#include "utils/exceptions.h" #include "utils/hash-utils.h" #include "utils/type_traits_core.h" From ae22a95b621199cf20fbbecc8b6decb933e0d1f2 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sat, 30 Sep 2023 13:16:53 +0000 Subject: [PATCH 10/13] fix the typo --- lib/runtime/src/ops/reduce.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index fdf381910e..49b1be4f1e 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -95,10 +95,10 @@ static DeviceSpecific return init_task_impl(acc); } -plate<> void register_task() { +template<> void register_task() { OpTaskSignature init(OpTaskType::INIT) - init.add_unchecked_arg_slot(HANDLE); + init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); From 0573b8874f4ac3c378b08993b64389911f8ca31b Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 10 Oct 2023 20:34:26 +0000 Subject: [PATCH 11/13] fix the reduce.cc --- lib/runtime/src/ops/reduce.cc | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index 49b1be4f1e..fe7130ca88 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -25,29 +25,6 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reduce; -bool operator==(ReduceParams const &lhs, ReduceParams const &rhs) { - return (lhs.axes == rhs.axes) && (lhs.keepdims == rhs.keepdims); -} - -bool ReduceParams::is_valid(ParallelTensorShape const &input) const { - for (size_t i = 0; i < axes.size(); i++) { - if (axes[i] >= input.num_dims) { - return false; - } - } - return input.is_valid(); -} - -ReduceParams Reduce::get_params() const { - ReduceParams params; - params.axes.clear(); - for (int i = 0; i < num_axes; i++) { - params.axes.push_back(this->axes[i]); - } - params.keepdims = keepdims; - return params; -} - enum Slots { INPUT, OUTPUT, @@ -64,8 +41,8 @@ OpTaskInvocation init(TransposeAttrs const &attrs) { binding.bind_arg(HANDLE, ff_handle()); binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_parallel_tensor_shape(0)); - binding.bind(OUTPUT, output_parallel_tensor_shape(0)); + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {REDUCE_INIT_TASK_ID, binding}; } @@ -101,6 +78,8 @@ template<> void register_task() { init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); + init.add_return_value(); + register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); } @@ -111,7 +90,7 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) { bind.bind_arg(PER_DEVICE_STATE, per_device_op_state()); bind.bind_arg(PROFILING, profiling_tensor()); - binding.bind(INPUT, input_parallel_tensor_shape(0)); + binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); return {REDUCE_FWD_TASK_ID, binding}; @@ -209,7 +188,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding fwd_binding; ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - fwd.bind(INPUT, input); + fwd.bind(INPUT, input.shape); fwd.bind(OUTPUT, output_shape); fwd.bind_arg(PROFILING, settings); fwd.bind_arg(PER_DEVICE_STATE, per_device_state); From dba438d992723090bc8412f08c8555b64946f2ca Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 10 Oct 2023 20:34:51 +0000 Subject: [PATCH 12/13] format the code --- lib/kernels/include/kernels/reduce_kernels.h | 12 ++++++------ lib/kernels/src/cuda/reduce_kernels.cu | 16 +++++++++------- lib/runtime/src/ops/reduce.cc | 7 ++++--- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 9db7bd9aea..1220a93203 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -15,12 +15,12 @@ struct ReducePerDeviceState { }; FF_VISITABLE_STRUCT(ReducePerDeviceState, - handle, - inputTensor, - outputTensor, - reduceDesc, - op_type, - reduction_size); + handle, + inputTensor, + outputTensor, + reduceDesc, + op_type, + reduction_size); namespace Kernels { namespace Reduce { diff --git a/lib/kernels/src/cuda/reduce_kernels.cu b/lib/kernels/src/cuda/reduce_kernels.cu index b7a5c824f2..a67f52999d 100644 --- a/lib/kernels/src/cuda/reduce_kernels.cu +++ b/lib/kernels/src/cuda/reduce_kernels.cu @@ -70,22 +70,24 @@ namespace Reduce { ReducePerDeviceState init_kernel(PerDeviceFFhandle const &handle, OperatorType const &op_type, size_t const &reduction_size, - ArrayShape const & input_shape, - ArrayShape const & output_shape) { - + ArrayShape const &input_shape, + ArrayShape const &output_shape) { + ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; ffReduceTensorDescriptor_t reduceDesc; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); - checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));; + checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + ; checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape)); - checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); - - ReducePerDeviceState per_device = {handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size}; + checkCUDNN( + cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + ReducePerDeviceState per_device = { + handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size}; } void forward_kernel(cudaStream_t stream, diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index fe7130ca88..6932e645b4 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -56,7 +56,7 @@ static DeviceSpecific OperatorType = attrs.op_type; // Note: How to set the reduction size? - size_t reduction_size ; + size_t reduction_size; DeviceSpecific per_device_state = acc.create_device_specific(init_kernel( handle, op_type, reduction_size, input.shape, output.shape)); @@ -72,10 +72,11 @@ static DeviceSpecific return init_task_impl(acc); } -template<> void register_task() { +template <> +void register_task() { OpTaskSignature init(OpTaskType::INIT) - init.add_unchecked_arg_slot(HANDLE); + init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); init.add_return_value(); From 925356e46e508b5971b23a27d526b448fbc62c31 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 29 Oct 2023 00:01:51 +0000 Subject: [PATCH 13/13] add reduction size --- lib/runtime/src/ops/reduce.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/reduce.cc b/lib/runtime/src/ops/reduce.cc index 6932e645b4..2674dc4fef 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/runtime/src/ops/reduce.cc @@ -56,7 +56,7 @@ static DeviceSpecific OperatorType = attrs.op_type; // Note: How to set the reduction size? - size_t reduction_size; + size_t reduction_size = input.shape.get_volume() / output.shape.get_volume(); DeviceSpecific per_device_state = acc.create_device_specific(init_kernel( handle, op_type, reduction_size, input.shape, output.shape));