From cd8032dd345812a0f5a484b4abbe4e88a0e5debb Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Thu, 7 Sep 2023 08:33:43 +0000 Subject: [PATCH 01/10] start split OP --- deps/fmt | 2 +- lib/runtime/src/ops/split.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deps/fmt b/deps/fmt index f5e54359df..a33701196a 160000 --- a/deps/fmt +++ b/deps/fmt @@ -1 +1 @@ -Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 +Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50 diff --git a/lib/runtime/src/ops/split.h b/lib/runtime/src/ops/split.h index d115c149b8..190ba03e25 100644 --- a/lib/runtime/src/ops/split.h +++ b/lib/runtime/src/ops/split.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_SPLIT_H #include "op-attrs/ops/split.h" -#include "op_task_invocation.h" +#include "task_spec/op_task_invocation.h" #include "sim_environment.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(SplitAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input_shape, ProfilingSettings const &settings, MachineView const &machine_view); From 59a1baa3fbcb4d5a6124b3cb1a22b4ba0ec5e5f0 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Thu, 7 Sep 2023 08:42:10 +0000 Subject: [PATCH 02/10] add methond interface --- lib/runtime/src/ops/split.cc | 742 ++++++++++++++++++++--------------- lib/runtime/src/ops/split.h | 2 +- 2 files changed, 416 insertions(+), 328 deletions(-) diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index 195ab7bef4..f155a14d04 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -15,6 +15,7 @@ #include "split.h" #include "kernels/split_kernels.h" +#include "utils/exception.decl.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -52,357 +53,444 @@ SplitParams Split::get_params() const { return params; } -void FFModel::split(const Tensor input, - Tensor *outputs, - std::vector const &splits, - int axis, - char const *name) { - Layer *split = new Layer(this, - OP_SPLIT, - DT_FLOAT, - name, - 1 /*inputs*/, - 0 /*weights*/, - splits.size() /*outputs*/, - input); - int numdim = input->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i] = input->dims[i]; - } - for (size_t i = 0; i < splits.size(); i++) { - dims[numdim - axis - 1] = splits[i]; - split->outputs[i] = create_tensor_legion_ordering( - numdim, dims, input->data_type, split, 0, true /*create_grad*/); - outputs[i] = split->outputs[i]; - } - split->add_int_property("legion_axis", numdim - axis - 1); - layers.push_back(split); -} +enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; -Op *Split::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - layer->get_int_property("legion_axis", value); - int legion_axis = value; - std::vector splits; - for (int i = 0; i < layer->numOutputs; i++) { - splits.push_back(layer->outputs[i]->dims[legion_axis]); - } - assert(inputs.size() == 1); - return new Split(model, inputs[0], splits, legion_axis, layer->name); -} +OpTaskInvocation init(SplitAttrs const & attr) { + OpTaskBinding binding; -Split::Split(FFModel &model, - const ParallelTensor input, - std::vector const &splits, - int _legion_axis, - char const *name) - : Op(model, - OP_SPLIT, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - splits.size() /*outputs*/, - input), - legion_axis(_legion_axis), splits(splits) { - numOutputs = splits.size(); - // Note that we use the Legion dim ordering - assert(legion_axis >= 0); - numWeights = 0; - int split_size = 0; - for (int i = 0; i < numOutputs; i++) { - split_size += splits[i]; - int numdim = input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - for (int j = 0; j < numdim; j++) { - dims[j] = input->dims[j]; - } - dims[legion_axis].size = splits[i]; - // Assert the _axis dim cannot be parallelized - assert(dims[legion_axis].degree == 1); - assert(dims[legion_axis].parallel_idx == -1); - outputs[i] = model.create_parallel_tensor_legion_ordering( - numdim, dims, input->data_type, this /*owner_op*/, i /*owner_idx*/); - } - // Check split sizes - assert(split_size == input->dims[legion_axis].size); -} + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); -Split::Split(FFModel &model, - SplitParams const ¶ms, - const ParallelTensor input, - char const *name) - : Split(model, input, params.splits, params.legion_axis, name) {} - -void Split::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - IndexLauncher launcher(SPLIT_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Split)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - for (int i = 0; i < numOutputs; i++) { - launcher.add_region_requirement(RegionRequirement(outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[i]->region)); - launcher.add_field(i + 1, FID_DATA); - } - runtime->execute_index_space(ctx, launcher); + return {SPLIT_INIT_TASK_ID, binding}; } -PerDeviceOpState *Split::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - return NULL; +OpTaskInvocation forward(SplitAttrs const & attrs) { + OpTaskBinding binding; + + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); + + return {SPLIT_FWD_TASK_ID, binding}; } -void Split::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - IndexLauncher launcher(SPLIT_FWD_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Split)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - for (int i = 0; i < numOutputs; i++) { - launcher.add_region_requirement(RegionRequirement(outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[i]->region)); - launcher.add_field(i + 1, FID_DATA); - } - runtime->execute_index_space(ctx, launcher); +OpTaskInvocation backward(SplitAttrs const & attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); + + return {SPLIT_BWD_TASK_ID, binding}; } -void calc_block_size(coord_t &num_blks, - coord_t &blk_size, - Domain const &domain, - int axis) { - num_blks = 1; - blk_size = 1; - for (int d = 0; d < domain.get_dim(); d++) { - if (d <= axis) { - blk_size *= (domain.hi()[d] - domain.lo()[d] + 1); - } else { - num_blks *= (domain.hi()[d] - domain.lo()[d] + 1); - } - } +static optional forward_task_impl(TaskArgumentAccessor const &acc) { + NOT_IM } -void Split::forward_task(Task const *task, +static void forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - Split const *split = (Split *)task->args; - assert(regions.size() == split->numOutputs + 1); - assert(task->regions.size() == split->numOutputs + 1); - Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - float *out_ptr[MAX_NUM_OUTPUTS]; - size_t total_volume = 0; - float const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, in_domain, split->legion_axis); - for (int i = 0; i < split->numOutputs; i++) { - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[i + 1].region.get_index_space()); - out_ptr[i] = helperGetTensorPointerWO( - regions[i + 1], task->regions[i + 1], FID_DATA, ctx, runtime); - coord_t out_num_blks; - calc_block_size( - out_num_blks, out_blk_size[i], out_domain, split->legion_axis); - assert(out_num_blks == num_blks); - for (int j = 0; j < out_domain.get_dim(); j++) { - if (j != split->legion_axis) { - assert(out_domain.hi()[j] == in_domain.hi()[j]); - assert(out_domain.lo()[j] == in_domain.lo()[j]); - } - } - total_volume += out_domain.get_volume(); - } - assert(total_volume == in_domain.get_volume()); - - forward_kernel_wrapper( - out_ptr, in_ptr, out_blk_size, in_blk_size, num_blks, split->numOutputs); + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); } -void Split::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - IndexLauncher launcher(SPLIT_BWD_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Split)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - for (int i = 0; i < numOutputs; i++) { - launcher.add_region_requirement(RegionRequirement(outputs[i]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[i]->region_grad)); - launcher.add_field(i + 1, FID_DATA); - } - runtime->execute_index_space(ctx, launcher); +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + NOT_IMPLEMENTED(); } -void Split::backward_task(Task const *task, +static void backward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - Split const *split = (Split *)task->args; - assert(regions.size() == split->numOutputs + 1); - assert(task->regions.size() == split->numOutputs + 1); - Domain in_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - float const *out_grad_ptr[MAX_NUM_OUTPUTS]; - size_t total_volume = 0; - float *in_grad_ptr = helperGetTensorPointerRW( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, in_grad_domain, split->legion_axis); - for (int i = 0; i < split->numOutputs; i++) { - Domain out_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[i + 1].region.get_index_space()); - out_grad_ptr[i] = helperGetTensorPointerRO( - regions[i + 1], task->regions[i + 1], FID_DATA, ctx, runtime); - coord_t out_num_blks; - calc_block_size( - out_num_blks, out_blk_size[i], out_grad_domain, split->legion_axis); - assert(out_num_blks == num_blks); - for (int j = 0; j < out_grad_domain.get_dim(); j++) { - if (j != split->legion_axis) { - assert(out_grad_domain.hi()[j] == in_grad_domain.hi()[j]); - assert(out_grad_domain.lo()[j] == in_grad_domain.lo()[j]); - } - } - total_volume += out_grad_domain.get_volume(); - } - assert(total_volume == in_grad_domain.get_volume()); - - backward_kernel_wrapper(in_grad_ptr, - out_grad_ptr, - out_blk_size, - in_blk_size, - num_blks, - split->numOutputs); + TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); } -bool Split::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_output[MAX_NUM_OUTPUTS], sub_input; - for (int i = 0; i < numOutputs; i++) { - if (!outputs[i]->get_sub_tensor(mv, sub_output[i])) { - return false; - } - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - Domain in_domain = sub_input.get_domain(); - sim->free_all(); - float *output_ptr[MAX_NUM_OUTPUTS]; - size_t total_volume = 0; - float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, in_domain, legion_axis); - for (int i = 0; i < numOutputs; i++) { - Domain out_domain = sub_output[i].get_domain(); - output_ptr[i] = - (float *)sim->allocate(sub_output[i].get_volume(), DT_FLOAT); - coord_t out_num_blks; - calc_block_size(out_num_blks, out_blk_size[i], out_domain, legion_axis); - assert(out_num_blks == num_blks); - for (int j = 0; j < out_domain.get_dim(); j++) { - if (j != legion_axis) { - assert(out_domain.hi()[j] == in_domain.hi()[j]); - assert(out_domain.lo()[j] == in_domain.lo()[j]); - } - } - total_volume += out_domain.get_volume(); - } - assert(total_volume == in_domain.get_volume()); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper( - output_ptr, input_ptr, out_blk_size, in_blk_size, num_blks, numOutputs); - }; - // Assume backward has the same cost as forward - backward = forward; - - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - if (sim->computationMode == COMP_MODE_TRAINING) { - printf("[Measure Split] name(%s) num_elements(%zu) forward_time(%.4lf) " - "backward_time(%.4lf)\n", - name, - sub_input.get_volume(), - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Split] name(%s) num_elements(%zu) forward_time(%.4lf)\n", - name, - sub_input.get_volume(), - cost_metrics.forward_time); - } - return true; +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + SplitAttrs const &attrs, + InputParallelTensorDes const &input, + ProfilingSettings const &settings, + MachineView const &machine_view) { + NOT_IMPLEMENTED(); } -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::SplitParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.splits.size()); - for (int n : params.splits) { - hash_combine(key, n); - } - hash_combine(key, params.legion_axis); - return key; +template <> +void register_task() { + OpTaskSignature init(OpTaskType::INIT); + + init.add_input_slot(INPUT); + init.add_output_slot(OUTPUT); + //TODO Note: split operator does not need SplitDeviceState, how to register the init_task and how to implement the init_task like cast OP? + } + +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_arg_slot(PROFILING); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task); +} + +template <> +void register_task() { + OpTaskSignature bwd = infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); + + register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task); +} + +// void FFModel::split(const Tensor input, +// Tensor *outputs, +// std::vector const &splits, +// int axis, +// char const *name) { +// Layer *split = new Layer(this, +// OP_SPLIT, +// DT_FLOAT, +// name, +// 1 /*inputs*/, +// 0 /*weights*/, +// splits.size() /*outputs*/, +// input); +// int numdim = input->num_dims; +// int dims[MAX_TENSOR_DIM]; +// for (int i = 0; i < numdim; i++) { +// dims[i] = input->dims[i]; +// } +// for (size_t i = 0; i < splits.size(); i++) { +// dims[numdim - axis - 1] = splits[i]; +// split->outputs[i] = create_tensor_legion_ordering( +// numdim, dims, input->data_type, split, 0, true /*create_grad*/); +// outputs[i] = split->outputs[i]; +// } +// split->add_int_property("legion_axis", numdim - axis - 1); +// layers.push_back(split); +// } + +// Op *Split::create_operator_from_layer( +// FFModel &model, +// Layer const *layer, +// std::vector const &inputs) { +// long long value; +// layer->get_int_property("legion_axis", value); +// int legion_axis = value; +// std::vector splits; +// for (int i = 0; i < layer->numOutputs; i++) { +// splits.push_back(layer->outputs[i]->dims[legion_axis]); +// } +// assert(inputs.size() == 1); +// return new Split(model, inputs[0], splits, legion_axis, layer->name); +// } + +// Split::Split(FFModel &model, +// const ParallelTensor input, +// std::vector const &splits, +// int _legion_axis, +// char const *name) +// : Op(model, +// OP_SPLIT, +// input->data_type, +// name, +// 1 /*inputs*/, +// 0 /*weights*/, +// splits.size() /*outputs*/, +// input), +// legion_axis(_legion_axis), splits(splits) { +// numOutputs = splits.size(); +// // Note that we use the Legion dim ordering +// assert(legion_axis >= 0); +// numWeights = 0; +// int split_size = 0; +// for (int i = 0; i < numOutputs; i++) { +// split_size += splits[i]; +// int numdim = input->num_dims; +// ParallelDim dims[MAX_TENSOR_DIM]; +// for (int j = 0; j < numdim; j++) { +// dims[j] = input->dims[j]; +// } +// dims[legion_axis].size = splits[i]; +// // Assert the _axis dim cannot be parallelized +// assert(dims[legion_axis].degree == 1); +// assert(dims[legion_axis].parallel_idx == -1); +// outputs[i] = model.create_parallel_tensor_legion_ordering( +// numdim, dims, input->data_type, this /*owner_op*/, i /*owner_idx*/); +// } +// // Check split sizes +// assert(split_size == input->dims[legion_axis].size); +// } + +// Split::Split(FFModel &model, +// SplitParams const ¶ms, +// const ParallelTensor input, +// char const *name) +// : Split(model, input, params.splits, params.legion_axis, name) {} + +// void Split::init(FFModel const &ff) { +// assert(check_output_input_weight_same_parallel_is()); +// parallel_is = outputs[0]->parallel_is; +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// IndexLauncher launcher(SPLIT_INIT_TASK_ID, +// parallel_is, +// TaskArgument(this, sizeof(Split)), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// inputs[0]->region)); +// launcher.add_field(0, FID_DATA); +// for (int i = 0; i < numOutputs; i++) { +// launcher.add_region_requirement(RegionRequirement(outputs[i]->part, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// outputs[i]->region)); +// launcher.add_field(i + 1, FID_DATA); +// } +// runtime->execute_index_space(ctx, launcher); +// } + +// PerDeviceOpState *Split::init_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// return NULL; +// } + +// void Split::forward(FFModel const &ff) { +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// IndexLauncher launcher(SPLIT_FWD_TASK_ID, +// parallel_is, +// TaskArgument(this, sizeof(Split)), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// inputs[0]->region)); +// launcher.add_field(0, FID_DATA); +// for (int i = 0; i < numOutputs; i++) { +// launcher.add_region_requirement(RegionRequirement(outputs[i]->part, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// outputs[i]->region)); +// launcher.add_field(i + 1, FID_DATA); +// } +// runtime->execute_index_space(ctx, launcher); +// } + +// void calc_block_size(coord_t &num_blks, +// coord_t &blk_size, +// Domain const &domain, +// int axis) { +// num_blks = 1; +// blk_size = 1; +// for (int d = 0; d < domain.get_dim(); d++) { +// if (d <= axis) { +// blk_size *= (domain.hi()[d] - domain.lo()[d] + 1); +// } else { +// num_blks *= (domain.hi()[d] - domain.lo()[d] + 1); +// } +// } +// } + +// void Split::forward_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// Split const *split = (Split *)task->args; +// assert(regions.size() == split->numOutputs + 1); +// assert(task->regions.size() == split->numOutputs + 1); +// Domain in_domain = runtime->get_index_space_domain( +// ctx, task->regions[0].region.get_index_space()); +// float *out_ptr[MAX_NUM_OUTPUTS]; +// size_t total_volume = 0; +// float const *in_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; +// calc_block_size(num_blks, in_blk_size, in_domain, split->legion_axis); +// for (int i = 0; i < split->numOutputs; i++) { +// Domain out_domain = runtime->get_index_space_domain( +// ctx, task->regions[i + 1].region.get_index_space()); +// out_ptr[i] = helperGetTensorPointerWO( +// regions[i + 1], task->regions[i + 1], FID_DATA, ctx, runtime); +// coord_t out_num_blks; +// calc_block_size( +// out_num_blks, out_blk_size[i], out_domain, split->legion_axis); +// assert(out_num_blks == num_blks); +// for (int j = 0; j < out_domain.get_dim(); j++) { +// if (j != split->legion_axis) { +// assert(out_domain.hi()[j] == in_domain.hi()[j]); +// assert(out_domain.lo()[j] == in_domain.lo()[j]); +// } +// } +// total_volume += out_domain.get_volume(); +// } +// assert(total_volume == in_domain.get_volume()); + +// forward_kernel_wrapper( +// out_ptr, in_ptr, out_blk_size, in_blk_size, num_blks, split->numOutputs); +// } + +// void Split::backward(FFModel const &ff) { +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// IndexLauncher launcher(SPLIT_BWD_TASK_ID, +// parallel_is, +// TaskArgument(this, sizeof(Split)), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, +// 0 /*projection id*/, +// READ_WRITE, +// EXCLUSIVE, +// inputs[0]->region_grad)); +// launcher.add_field(0, FID_DATA); +// for (int i = 0; i < numOutputs; i++) { +// launcher.add_region_requirement(RegionRequirement(outputs[i]->part_grad, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// outputs[i]->region_grad)); +// launcher.add_field(i + 1, FID_DATA); +// } +// runtime->execute_index_space(ctx, launcher); +// } + +// void Split::backward_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// Split const *split = (Split *)task->args; +// assert(regions.size() == split->numOutputs + 1); +// assert(task->regions.size() == split->numOutputs + 1); +// Domain in_grad_domain = runtime->get_index_space_domain( +// ctx, task->regions[0].region.get_index_space()); +// float const *out_grad_ptr[MAX_NUM_OUTPUTS]; +// size_t total_volume = 0; +// float *in_grad_ptr = helperGetTensorPointerRW( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; +// calc_block_size(num_blks, in_blk_size, in_grad_domain, split->legion_axis); +// for (int i = 0; i < split->numOutputs; i++) { +// Domain out_grad_domain = runtime->get_index_space_domain( +// ctx, task->regions[i + 1].region.get_index_space()); +// out_grad_ptr[i] = helperGetTensorPointerRO( +// regions[i + 1], task->regions[i + 1], FID_DATA, ctx, runtime); +// coord_t out_num_blks; +// calc_block_size( +// out_num_blks, out_blk_size[i], out_grad_domain, split->legion_axis); +// assert(out_num_blks == num_blks); +// for (int j = 0; j < out_grad_domain.get_dim(); j++) { +// if (j != split->legion_axis) { +// assert(out_grad_domain.hi()[j] == in_grad_domain.hi()[j]); +// assert(out_grad_domain.lo()[j] == in_grad_domain.lo()[j]); +// } +// } +// total_volume += out_grad_domain.get_volume(); +// } +// assert(total_volume == in_grad_domain.get_volume()); + +// backward_kernel_wrapper(in_grad_ptr, +// out_grad_ptr, +// out_blk_size, +// in_blk_size, +// num_blks, +// split->numOutputs); +// } + +// bool Split::measure_operator_cost(Simulator *sim, +// MachineView const &mv, +// CostMetrics &cost_metrics) const { +// ParallelTensorBase sub_output[MAX_NUM_OUTPUTS], sub_input; +// for (int i = 0; i < numOutputs; i++) { +// if (!outputs[i]->get_sub_tensor(mv, sub_output[i])) { +// return false; +// } +// } +// if (!inputs[0]->get_sub_tensor(mv, sub_input)) { +// return false; +// } +// Domain in_domain = sub_input.get_domain(); +// sim->free_all(); +// float *output_ptr[MAX_NUM_OUTPUTS]; +// size_t total_volume = 0; +// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); +// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; +// calc_block_size(num_blks, in_blk_size, in_domain, legion_axis); +// for (int i = 0; i < numOutputs; i++) { +// Domain out_domain = sub_output[i].get_domain(); +// output_ptr[i] = +// (float *)sim->allocate(sub_output[i].get_volume(), DT_FLOAT); +// coord_t out_num_blks; +// calc_block_size(out_num_blks, out_blk_size[i], out_domain, legion_axis); +// assert(out_num_blks == num_blks); +// for (int j = 0; j < out_domain.get_dim(); j++) { +// if (j != legion_axis) { +// assert(out_domain.hi()[j] == in_domain.hi()[j]); +// assert(out_domain.lo()[j] == in_domain.lo()[j]); +// } +// } +// total_volume += out_domain.get_volume(); +// } +// assert(total_volume == in_domain.get_volume()); +// cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + +// std::function forward, backward; +// forward = [&] { +// forward_kernel_wrapper( +// output_ptr, input_ptr, out_blk_size, in_blk_size, num_blks, numOutputs); +// }; +// // Assume backward has the same cost as forward +// backward = forward; + +// inner_measure_operator_cost(sim, forward, backward, cost_metrics); +// if (sim->computationMode == COMP_MODE_TRAINING) { +// printf("[Measure Split] name(%s) num_elements(%zu) forward_time(%.4lf) " +// "backward_time(%.4lf)\n", +// name, +// sub_input.get_volume(), +// cost_metrics.forward_time, +// cost_metrics.backward_time); +// } else { +// printf("[Measure Split] name(%s) num_elements(%zu) forward_time(%.4lf)\n", +// name, +// sub_input.get_volume(), +// cost_metrics.forward_time); +// } +// return true; +// } + +// }; // namespace FlexFlow + +// namespace std { +// size_t hash::operator()( +// FlexFlow::SplitParams const ¶ms) const { +// size_t key = 0; +// hash_combine(key, params.splits.size()); +// for (int n : params.splits) { +// hash_combine(key, n); +// } +// hash_combine(key, params.legion_axis); +// return key; +// } }; // namespace std diff --git a/lib/runtime/src/ops/split.h b/lib/runtime/src/ops/split.h index 190ba03e25..585e0fd73b 100644 --- a/lib/runtime/src/ops/split.h +++ b/lib/runtime/src/ops/split.h @@ -20,7 +20,7 @@ OpTaskInvocation backward(SplitAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, - InputParallelTensorDesc const &input_shape, + InputParallelTensorDes const &input, ProfilingSettings const &settings, MachineView const &machine_view); From 5b036fb4605fb6120d6eefb03a9e6e965494bcd4 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Thu, 7 Sep 2023 08:49:55 +0000 Subject: [PATCH 03/10] leave forward_task_impl, backward_task_impl to implement because I don't know how to get in_blk_size, num_blks, numoutputs --- lib/runtime/src/ops/split.cc | 84 ++++++++++++++++++++++++++---------- lib/runtime/src/ops/split.h | 2 +- 2 files changed, 63 insertions(+), 23 deletions(-) diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index f155a14d04..e5fd307f6a 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -55,7 +55,7 @@ SplitParams Split::get_params() const { enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; -OpTaskInvocation init(SplitAttrs const & attr) { +OpTaskInvocation init(SplitAttrs const &attr) { OpTaskBinding binding; binding.bind(INPUT, input_tensor(0)); @@ -64,7 +64,7 @@ OpTaskInvocation init(SplitAttrs const & attr) { return {SPLIT_INIT_TASK_ID, binding}; } -OpTaskInvocation forward(SplitAttrs const & attrs) { +OpTaskInvocation forward(SplitAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(PROFILING, profiling_settings()); @@ -74,14 +74,23 @@ OpTaskInvocation forward(SplitAttrs const & attrs) { return {SPLIT_FWD_TASK_ID, binding}; } -OpTaskInvocation backward(SplitAttrs const & attrs) { +OpTaskInvocation backward(SplitAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); return {SPLIT_BWD_TASK_ID, binding}; } static optional forward_task_impl(TaskArgumentAccessor const &acc) { - NOT_IM + acc.get_argument(PER_DEVICE_STATE); + ProfilingSettings profiling = acc.get_argument(PROFILING); + + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + + // Note: forward_kernel needs parameter Legion::coord_t const + // *out_blk_sizes,Legion::coord_t in_blk_size, Legion::coord_t num_blks, int + // numOutputs how to get these parameter? + NOT_IMPLEMENTED(); } static void forward_task(Task const *task, @@ -92,7 +101,15 @@ static void forward_task(Task const *task, forward_task_impl(acc); } -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + + // Note: backward_kernel needs parameter Legion::coord_t const + // *out_blk_sizes,Legion::coord_t in_blk_size, Legion::coord_t num_blks, int + // numOutputs how to get these parameter? NOT_IMPLEMENTED(); } @@ -109,7 +126,25 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDes const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - NOT_IMPLEMENTED(); + auto env = sim.new_environment(); + + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + + SimTaskBinding fwd_binding; + fwd_binding.bind(INPUT, input); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind_arg(PROFILING, settings); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(SPLIT_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(SPLIT_BWD_TASK_ID, bwd_binding); + + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); } template <> @@ -118,8 +153,8 @@ void register_task() { init.add_input_slot(INPUT); init.add_output_slot(OUTPUT); - //TODO Note: split operator does not need SplitDeviceState, how to register the init_task and how to implement the init_task like cast OP? - + // TODO Note: split operator does not need SplitDeviceState, how to register + // the init_task and how to implement the init_task like cast OP? } template <> @@ -135,7 +170,8 @@ void register_task() { template <> void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); + OpTaskSignature bwd = + infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task); } @@ -258,9 +294,8 @@ void register_task() { // } // PerDeviceOpState *Split::init_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { +// std::vector const +// ®ions, Context ctx, Runtime *runtime) { // return NULL; // } @@ -343,7 +378,8 @@ void register_task() { // assert(total_volume == in_domain.get_volume()); // forward_kernel_wrapper( -// out_ptr, in_ptr, out_blk_size, in_blk_size, num_blks, split->numOutputs); +// out_ptr, in_ptr, out_blk_size, in_blk_size, num_blks, +// split->numOutputs); // } // void Split::backward(FFModel const &ff) { @@ -433,11 +469,12 @@ void register_task() { // sim->free_all(); // float *output_ptr[MAX_NUM_OUTPUTS]; // size_t total_volume = 0; -// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); -// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); -// coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; -// calc_block_size(num_blks, in_blk_size, in_domain, legion_axis); -// for (int i = 0; i < numOutputs; i++) { +// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), +// DT_FLOAT); cost_metrics.inputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); coord_t num_blks, +// in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; calc_block_size(num_blks, +// in_blk_size, in_domain, legion_axis); for (int i = 0; i < numOutputs; i++) +// { // Domain out_domain = sub_output[i].get_domain(); // output_ptr[i] = // (float *)sim->allocate(sub_output[i].get_volume(), DT_FLOAT); @@ -453,12 +490,14 @@ void register_task() { // total_volume += out_domain.get_volume(); // } // assert(total_volume == in_domain.get_volume()); -// cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// cost_metrics.outputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); // std::function forward, backward; // forward = [&] { // forward_kernel_wrapper( -// output_ptr, input_ptr, out_blk_size, in_blk_size, num_blks, numOutputs); +// output_ptr, input_ptr, out_blk_size, in_blk_size, num_blks, +// numOutputs); // }; // // Assume backward has the same cost as forward // backward = forward; @@ -472,7 +511,8 @@ void register_task() { // cost_metrics.forward_time, // cost_metrics.backward_time); // } else { -// printf("[Measure Split] name(%s) num_elements(%zu) forward_time(%.4lf)\n", +// printf("[Measure Split] name(%s) num_elements(%zu) +// forward_time(%.4lf)\n", // name, // sub_input.get_volume(), // cost_metrics.forward_time); @@ -493,4 +533,4 @@ void register_task() { // hash_combine(key, params.legion_axis); // return key; // } -}; // namespace std +}; // namespace FlexFlow diff --git a/lib/runtime/src/ops/split.h b/lib/runtime/src/ops/split.h index 585e0fd73b..d63212e836 100644 --- a/lib/runtime/src/ops/split.h +++ b/lib/runtime/src/ops/split.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_SPLIT_H #include "op-attrs/ops/split.h" -#include "task_spec/op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { From 1d8ad04ed537e02b10b233a525c37ed8fa56eebd Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Fri, 8 Sep 2023 01:52:32 +0000 Subject: [PATCH 04/10] leave to implement in_blk_size --- lib/runtime/src/ops/split.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index e5fd307f6a..39d63d4158 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -104,9 +104,8 @@ static void forward_task(Task const *task, static optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); - // Note: backward_kernel needs parameter Legion::coord_t const // *out_blk_sizes,Legion::coord_t in_blk_size, Legion::coord_t num_blks, int // numOutputs how to get these parameter? From 8c913bc11b5fefa7ceaa7bb8375f2a0551fce175 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 20:20:59 +0000 Subject: [PATCH 05/10] leave some problem --- lib/runtime/src/ops/split.cc | 363 +---------------------------------- 1 file changed, 3 insertions(+), 360 deletions(-) diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index 39d63d4158..77f7e768b5 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -58,7 +58,7 @@ enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; OpTaskInvocation init(SplitAttrs const &attr) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); + binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); return {SPLIT_INIT_TASK_ID, binding}; @@ -68,7 +68,7 @@ OpTaskInvocation forward(SplitAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(PROFILING, profiling_settings()); - binding.bind(INPUT, input_tensor(0)); + binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); return {SPLIT_FWD_TASK_ID, binding}; @@ -160,7 +160,7 @@ template <> void register_task() { OpTaskSignature fwd(OpTaskType::FWD); - fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); @@ -175,361 +175,4 @@ void register_task() { register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task); } -// void FFModel::split(const Tensor input, -// Tensor *outputs, -// std::vector const &splits, -// int axis, -// char const *name) { -// Layer *split = new Layer(this, -// OP_SPLIT, -// DT_FLOAT, -// name, -// 1 /*inputs*/, -// 0 /*weights*/, -// splits.size() /*outputs*/, -// input); -// int numdim = input->num_dims; -// int dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdim; i++) { -// dims[i] = input->dims[i]; -// } -// for (size_t i = 0; i < splits.size(); i++) { -// dims[numdim - axis - 1] = splits[i]; -// split->outputs[i] = create_tensor_legion_ordering( -// numdim, dims, input->data_type, split, 0, true /*create_grad*/); -// outputs[i] = split->outputs[i]; -// } -// split->add_int_property("legion_axis", numdim - axis - 1); -// layers.push_back(split); -// } - -// Op *Split::create_operator_from_layer( -// FFModel &model, -// Layer const *layer, -// std::vector const &inputs) { -// long long value; -// layer->get_int_property("legion_axis", value); -// int legion_axis = value; -// std::vector splits; -// for (int i = 0; i < layer->numOutputs; i++) { -// splits.push_back(layer->outputs[i]->dims[legion_axis]); -// } -// assert(inputs.size() == 1); -// return new Split(model, inputs[0], splits, legion_axis, layer->name); -// } - -// Split::Split(FFModel &model, -// const ParallelTensor input, -// std::vector const &splits, -// int _legion_axis, -// char const *name) -// : Op(model, -// OP_SPLIT, -// input->data_type, -// name, -// 1 /*inputs*/, -// 0 /*weights*/, -// splits.size() /*outputs*/, -// input), -// legion_axis(_legion_axis), splits(splits) { -// numOutputs = splits.size(); -// // Note that we use the Legion dim ordering -// assert(legion_axis >= 0); -// numWeights = 0; -// int split_size = 0; -// for (int i = 0; i < numOutputs; i++) { -// split_size += splits[i]; -// int numdim = input->num_dims; -// ParallelDim dims[MAX_TENSOR_DIM]; -// for (int j = 0; j < numdim; j++) { -// dims[j] = input->dims[j]; -// } -// dims[legion_axis].size = splits[i]; -// // Assert the _axis dim cannot be parallelized -// assert(dims[legion_axis].degree == 1); -// assert(dims[legion_axis].parallel_idx == -1); -// outputs[i] = model.create_parallel_tensor_legion_ordering( -// numdim, dims, input->data_type, this /*owner_op*/, i /*owner_idx*/); -// } -// // Check split sizes -// assert(split_size == input->dims[legion_axis].size); -// } - -// Split::Split(FFModel &model, -// SplitParams const ¶ms, -// const ParallelTensor input, -// char const *name) -// : Split(model, input, params.splits, params.legion_axis, name) {} - -// void Split::init(FFModel const &ff) { -// assert(check_output_input_weight_same_parallel_is()); -// parallel_is = outputs[0]->parallel_is; -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// IndexLauncher launcher(SPLIT_INIT_TASK_ID, -// parallel_is, -// TaskArgument(this, sizeof(Split)), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// for (int i = 0; i < numOutputs; i++) { -// launcher.add_region_requirement(RegionRequirement(outputs[i]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[i]->region)); -// launcher.add_field(i + 1, FID_DATA); -// } -// runtime->execute_index_space(ctx, launcher); -// } - -// PerDeviceOpState *Split::init_task(Task const *task, -// std::vector const -// ®ions, Context ctx, Runtime *runtime) { -// return NULL; -// } - -// void Split::forward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// IndexLauncher launcher(SPLIT_FWD_TASK_ID, -// parallel_is, -// TaskArgument(this, sizeof(Split)), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// for (int i = 0; i < numOutputs; i++) { -// launcher.add_region_requirement(RegionRequirement(outputs[i]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[i]->region)); -// launcher.add_field(i + 1, FID_DATA); -// } -// runtime->execute_index_space(ctx, launcher); -// } - -// void calc_block_size(coord_t &num_blks, -// coord_t &blk_size, -// Domain const &domain, -// int axis) { -// num_blks = 1; -// blk_size = 1; -// for (int d = 0; d < domain.get_dim(); d++) { -// if (d <= axis) { -// blk_size *= (domain.hi()[d] - domain.lo()[d] + 1); -// } else { -// num_blks *= (domain.hi()[d] - domain.lo()[d] + 1); -// } -// } -// } - -// void Split::forward_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { -// Split const *split = (Split *)task->args; -// assert(regions.size() == split->numOutputs + 1); -// assert(task->regions.size() == split->numOutputs + 1); -// Domain in_domain = runtime->get_index_space_domain( -// ctx, task->regions[0].region.get_index_space()); -// float *out_ptr[MAX_NUM_OUTPUTS]; -// size_t total_volume = 0; -// float const *in_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; -// calc_block_size(num_blks, in_blk_size, in_domain, split->legion_axis); -// for (int i = 0; i < split->numOutputs; i++) { -// Domain out_domain = runtime->get_index_space_domain( -// ctx, task->regions[i + 1].region.get_index_space()); -// out_ptr[i] = helperGetTensorPointerWO( -// regions[i + 1], task->regions[i + 1], FID_DATA, ctx, runtime); -// coord_t out_num_blks; -// calc_block_size( -// out_num_blks, out_blk_size[i], out_domain, split->legion_axis); -// assert(out_num_blks == num_blks); -// for (int j = 0; j < out_domain.get_dim(); j++) { -// if (j != split->legion_axis) { -// assert(out_domain.hi()[j] == in_domain.hi()[j]); -// assert(out_domain.lo()[j] == in_domain.lo()[j]); -// } -// } -// total_volume += out_domain.get_volume(); -// } -// assert(total_volume == in_domain.get_volume()); - -// forward_kernel_wrapper( -// out_ptr, in_ptr, out_blk_size, in_blk_size, num_blks, -// split->numOutputs); -// } - -// void Split::backward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// IndexLauncher launcher(SPLIT_BWD_TASK_ID, -// parallel_is, -// TaskArgument(this, sizeof(Split)), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[0]->region_grad)); -// launcher.add_field(0, FID_DATA); -// for (int i = 0; i < numOutputs; i++) { -// launcher.add_region_requirement(RegionRequirement(outputs[i]->part_grad, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// outputs[i]->region_grad)); -// launcher.add_field(i + 1, FID_DATA); -// } -// runtime->execute_index_space(ctx, launcher); -// } - -// void Split::backward_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { -// Split const *split = (Split *)task->args; -// assert(regions.size() == split->numOutputs + 1); -// assert(task->regions.size() == split->numOutputs + 1); -// Domain in_grad_domain = runtime->get_index_space_domain( -// ctx, task->regions[0].region.get_index_space()); -// float const *out_grad_ptr[MAX_NUM_OUTPUTS]; -// size_t total_volume = 0; -// float *in_grad_ptr = helperGetTensorPointerRW( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; -// calc_block_size(num_blks, in_blk_size, in_grad_domain, split->legion_axis); -// for (int i = 0; i < split->numOutputs; i++) { -// Domain out_grad_domain = runtime->get_index_space_domain( -// ctx, task->regions[i + 1].region.get_index_space()); -// out_grad_ptr[i] = helperGetTensorPointerRO( -// regions[i + 1], task->regions[i + 1], FID_DATA, ctx, runtime); -// coord_t out_num_blks; -// calc_block_size( -// out_num_blks, out_blk_size[i], out_grad_domain, split->legion_axis); -// assert(out_num_blks == num_blks); -// for (int j = 0; j < out_grad_domain.get_dim(); j++) { -// if (j != split->legion_axis) { -// assert(out_grad_domain.hi()[j] == in_grad_domain.hi()[j]); -// assert(out_grad_domain.lo()[j] == in_grad_domain.lo()[j]); -// } -// } -// total_volume += out_grad_domain.get_volume(); -// } -// assert(total_volume == in_grad_domain.get_volume()); - -// backward_kernel_wrapper(in_grad_ptr, -// out_grad_ptr, -// out_blk_size, -// in_blk_size, -// num_blks, -// split->numOutputs); -// } - -// bool Split::measure_operator_cost(Simulator *sim, -// MachineView const &mv, -// CostMetrics &cost_metrics) const { -// ParallelTensorBase sub_output[MAX_NUM_OUTPUTS], sub_input; -// for (int i = 0; i < numOutputs; i++) { -// if (!outputs[i]->get_sub_tensor(mv, sub_output[i])) { -// return false; -// } -// } -// if (!inputs[0]->get_sub_tensor(mv, sub_input)) { -// return false; -// } -// Domain in_domain = sub_input.get_domain(); -// sim->free_all(); -// float *output_ptr[MAX_NUM_OUTPUTS]; -// size_t total_volume = 0; -// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), -// DT_FLOAT); cost_metrics.inputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); coord_t num_blks, -// in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; calc_block_size(num_blks, -// in_blk_size, in_domain, legion_axis); for (int i = 0; i < numOutputs; i++) -// { -// Domain out_domain = sub_output[i].get_domain(); -// output_ptr[i] = -// (float *)sim->allocate(sub_output[i].get_volume(), DT_FLOAT); -// coord_t out_num_blks; -// calc_block_size(out_num_blks, out_blk_size[i], out_domain, legion_axis); -// assert(out_num_blks == num_blks); -// for (int j = 0; j < out_domain.get_dim(); j++) { -// if (j != legion_axis) { -// assert(out_domain.hi()[j] == in_domain.hi()[j]); -// assert(out_domain.lo()[j] == in_domain.lo()[j]); -// } -// } -// total_volume += out_domain.get_volume(); -// } -// assert(total_volume == in_domain.get_volume()); -// cost_metrics.outputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); - -// std::function forward, backward; -// forward = [&] { -// forward_kernel_wrapper( -// output_ptr, input_ptr, out_blk_size, in_blk_size, num_blks, -// numOutputs); -// }; -// // Assume backward has the same cost as forward -// backward = forward; - -// inner_measure_operator_cost(sim, forward, backward, cost_metrics); -// if (sim->computationMode == COMP_MODE_TRAINING) { -// printf("[Measure Split] name(%s) num_elements(%zu) forward_time(%.4lf) " -// "backward_time(%.4lf)\n", -// name, -// sub_input.get_volume(), -// cost_metrics.forward_time, -// cost_metrics.backward_time); -// } else { -// printf("[Measure Split] name(%s) num_elements(%zu) -// forward_time(%.4lf)\n", -// name, -// sub_input.get_volume(), -// cost_metrics.forward_time); -// } -// return true; -// } - -// }; // namespace FlexFlow - -// namespace std { -// size_t hash::operator()( -// FlexFlow::SplitParams const ¶ms) const { -// size_t key = 0; -// hash_combine(key, params.splits.size()); -// for (int n : params.splits) { -// hash_combine(key, n); -// } -// hash_combine(key, params.legion_axis); -// return key; -// } }; // namespace FlexFlow From 22200da388e039ed13c75013cfe76181c5803335 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 27 Sep 2023 20:35:05 +0000 Subject: [PATCH 06/10] use exceptions --- lib/runtime/src/ops/split.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index 77f7e768b5..142fbf2387 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -15,7 +15,7 @@ #include "split.h" #include "kernels/split_kernels.h" -#include "utils/exception.decl.h" +#include "utils/exceptions.h" #include "utils/hash-utils.h" namespace FlexFlow { From 094ea6102f56bb16542746b3d186c90458d7c8c2 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 3 Oct 2023 15:00:27 +0000 Subject: [PATCH 07/10] implement the split and use cal_block_size --- lib/runtime/src/ops/split.cc | 103 +++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index 142fbf2387..e371f08a4d 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -14,6 +14,7 @@ */ #include "split.h" +#include "kernels/array_shape.h" #include "kernels/split_kernels.h" #include "utils/exceptions.h" #include "utils/hash-utils.h" @@ -38,36 +39,13 @@ using PCG::Node; using namespace FlexFlow::Kernels::Split; -bool operator==(SplitParams const &lhs, SplitParams const &rhs) { - return lhs.splits == rhs.splits && lhs.legion_axis == rhs.legion_axis; -} - -bool SplitParams::is_valid(ParallelTensorShape const &input) const { - return input.is_valid(); -} - -SplitParams Split::get_params() const { - SplitParams params; - params.splits = this->splits; - params.legion_axis = this->legion_axis; - return params; -} - enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; -OpTaskInvocation init(SplitAttrs const &attr) { - OpTaskBinding binding; - - binding.bind(INPUT, input_parallel_tensor_shape(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {SPLIT_INIT_TASK_ID, binding}; -} - OpTaskInvocation forward(SplitAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(ATTRS, attrs); binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -83,14 +61,27 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { static optional forward_task_impl(TaskArgumentAccessor const &acc) { acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - - // Note: forward_kernel needs parameter Legion::coord_t const - // *out_blk_sizes,Legion::coord_t in_blk_size, Legion::coord_t num_blks, int - // numOutputs how to get these parameter? - NOT_IMPLEMENTED(); + auto attrs = acc.get_argument(ATTRS); + + coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; + calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis); + + for (int i = 0; i < attrs.splits.size(); i++) { + coord_t out_num_blks; + calc_block_size( + out_num_blks, out_blk_size[i], output.shape, split->legion_axis); + } + return profile(forward_kernel, + profiling, + "Split forward_time = %.2lfms\n", + &output.get_float_ptr(), + input.get_float_ptr(), + out_blk_size, + in_blk_size, + num_blks, + attrs.splits.size()); } static void forward_task(Task const *task, @@ -101,15 +92,45 @@ static void forward_task(Task const *task, forward_task_impl(acc); } +// maybe we should add assert like the original code static optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); - // Note: backward_kernel needs parameter Legion::coord_t const - // *out_blk_sizes,Legion::coord_t in_blk_size, Legion::coord_t num_blks, int - // numOutputs how to get these parameter? - NOT_IMPLEMENTED(); + auto attrs = acc.get_argument(ATTRS); + + coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; + calc_block_size(num_blks, in_blk_size, input_grade.shape, attrs.axis); + for (int i = 0; i < attrs.splits.size(); i++) { + coord_t out_num_blks; + calc_block_size( + out_num_blks, out_blk_size[i], output_grad.shape, split->legion_axis); + } + return profile(backward_kernel, + profiling, + "Split backward_time = %.2lfms\n", + input_grad.get_float_ptr(), + &output_grad.get_float_ptr(), + out_blk_size, + in_blk_size, + num_blks, + attrs.splits.size()); +} + +void calc_block_size(coord_t &num_blks, + coord_t &blk_size, + ArrayShape const &array_shape, + int axis) { + num_blks = 1; + blk_size = 1; + for (int d = 0; d < array_shape.get_dim(); d++) { + if (d <= axis) { + blk_size *= (domain.hi()[d] - domain.lo()[d] + 1); + blk_size *= array_shape.at(legion_dim_t(d)) + 1 + } else { + num_blks *= array_shape.at(legion_dim_t(d)) + 1 + } + } } static void backward_task(Task const *task, @@ -122,7 +143,7 @@ static void backward_task(Task const *task, CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, - InputParallelTensorDes const &input, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { auto env = sim.new_environment(); @@ -146,16 +167,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, return make_metrics(forward_time, backward_time, sync_time, env); } -template <> -void register_task() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_input_slot(INPUT); - init.add_output_slot(OUTPUT); - // TODO Note: split operator does not need SplitDeviceState, how to register - // the init_task and how to implement the init_task like cast OP? -} - template <> void register_task() { OpTaskSignature fwd(OpTaskType::FWD); From 7820621792d70635541bbaca70063c035f4760e9 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 10 Oct 2023 20:21:34 +0000 Subject: [PATCH 08/10] update the split --- lib/runtime/src/ops/split.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index e371f08a4d..b988186903 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -46,7 +46,7 @@ OpTaskInvocation forward(SplitAttrs const &attrs) { binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_parallel_tensor_shape(0)); + binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); return {SPLIT_FWD_TASK_ID, binding}; From 237017222099f48890dfe15184b4e9405179c1a0 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 17 Oct 2023 09:39:19 -0700 Subject: [PATCH 09/10] Update lib/runtime/src/ops/split.cc --- lib/runtime/src/ops/split.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index b988186903..e8436d2b77 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -59,7 +59,7 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { } static optional forward_task_impl(TaskArgumentAccessor const &acc) { - acc.get_argument(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); From 05b27375e8024d2df49cb23fa7594de326abc63e Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 17 Oct 2023 20:56:50 +0000 Subject: [PATCH 10/10] fix some error --- lib/runtime/src/ops/split.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index b988186903..2af5d42874 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -59,7 +59,6 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { } static optional forward_task_impl(TaskArgumentAccessor const &acc) { - acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -96,7 +95,7 @@ static void forward_task(Task const *task, static optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; @@ -151,7 +150,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding fwd_binding; - fwd_binding.bind(INPUT, input); + fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); fwd_binding.bind_arg(PROFILING, settings);