diff --git a/deps/fmt b/deps/fmt index f5e54359df..a33701196a 160000 --- a/deps/fmt +++ b/deps/fmt @@ -1 +1 @@ -Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 +Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50 diff --git a/lib/runtime/src/ops/split.cc b/lib/runtime/src/ops/split.cc index 195ab7bef4..2af5d42874 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/runtime/src/ops/split.cc @@ -14,7 +14,9 @@ */ #include "split.h" +#include "kernels/array_shape.h" #include "kernels/split_kernels.h" +#include "utils/exceptions.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -37,372 +39,150 @@ using PCG::Node; using namespace FlexFlow::Kernels::Split; -bool operator==(SplitParams const &lhs, SplitParams const &rhs) { - return lhs.splits == rhs.splits && lhs.legion_axis == rhs.legion_axis; -} +enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; -bool SplitParams::is_valid(ParallelTensorShape const &input) const { - return input.is_valid(); -} +OpTaskInvocation forward(SplitAttrs const &attrs) { + OpTaskBinding binding; -SplitParams Split::get_params() const { - SplitParams params; - params.splits = this->splits; - params.legion_axis = this->legion_axis; - return params; -} + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(ATTRS, attrs); + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); -void FFModel::split(const Tensor input, - Tensor *outputs, - std::vector const &splits, - int axis, - char const *name) { - Layer *split = new Layer(this, - OP_SPLIT, - DT_FLOAT, - name, - 1 /*inputs*/, - 0 /*weights*/, - splits.size() /*outputs*/, - input); - int numdim = input->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i] = input->dims[i]; - } - for (size_t i = 0; i < splits.size(); i++) { - dims[numdim - axis - 1] = splits[i]; - split->outputs[i] = create_tensor_legion_ordering( - numdim, dims, input->data_type, split, 0, true /*create_grad*/); - outputs[i] = 
split->outputs[i]; - } - split->add_int_property("legion_axis", numdim - axis - 1); - layers.push_back(split); + return {SPLIT_FWD_TASK_ID, binding}; } -Op *Split::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - layer->get_int_property("legion_axis", value); - int legion_axis = value; - std::vector splits; - for (int i = 0; i < layer->numOutputs; i++) { - splits.push_back(layer->outputs[i]->dims[legion_axis]); - } - assert(inputs.size() == 1); - return new Split(model, inputs[0], splits, legion_axis, layer->name); -} +OpTaskInvocation backward(SplitAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); -Split::Split(FFModel &model, - const ParallelTensor input, - std::vector const &splits, - int _legion_axis, - char const *name) - : Op(model, - OP_SPLIT, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - splits.size() /*outputs*/, - input), - legion_axis(_legion_axis), splits(splits) { - numOutputs = splits.size(); - // Note that we use the Legion dim ordering - assert(legion_axis >= 0); - numWeights = 0; - int split_size = 0; - for (int i = 0; i < numOutputs; i++) { - split_size += splits[i]; - int numdim = input->num_dims; - ParallelDim dims[MAX_TENSOR_DIM]; - for (int j = 0; j < numdim; j++) { - dims[j] = input->dims[j]; - } - dims[legion_axis].size = splits[i]; - // Assert the _axis dim cannot be parallelized - assert(dims[legion_axis].degree == 1); - assert(dims[legion_axis].parallel_idx == -1); - outputs[i] = model.create_parallel_tensor_legion_ordering( - numdim, dims, input->data_type, this /*owner_op*/, i /*owner_idx*/); - } - // Check split sizes - assert(split_size == input->dims[legion_axis].size); + return {SPLIT_BWD_TASK_ID, binding}; } -Split::Split(FFModel &model, - SplitParams const ¶ms, - const ParallelTensor input, - char const *name) - : Split(model, input, params.splits, params.legion_axis, name) {} +static optional 
forward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto attrs = acc.get_argument(ATTRS); + + coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; + calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis); -void Split::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - IndexLauncher launcher(SPLIT_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Split)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - for (int i = 0; i < numOutputs; i++) { - launcher.add_region_requirement(RegionRequirement(outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[i]->region)); - launcher.add_field(i + 1, FID_DATA); + for (int i = 0; i < attrs.splits.size(); i++) { + coord_t out_num_blks; + calc_block_size( + out_num_blks, out_blk_size[i], output.shape, attrs.axis); } - runtime->execute_index_space(ctx, launcher); + return profile(forward_kernel, + profiling, + "Split forward_time = %.2lfms\n", + &output.get_float_ptr(), + input.get_float_ptr(), + out_blk_size, + in_blk_size, + num_blks, + attrs.splits.size()); } -PerDeviceOpState *Split::init_task(Task const *task, - std::vector const &regions, - Context ctx, - Runtime *runtime) { - return NULL; +static void forward_task(Task const *task, + std::vector const &regions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); } -void Split::forward(FFModel const &ff) { - ArgumentMap
argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - IndexLauncher launcher(SPLIT_FWD_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Split)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - for (int i = 0; i < numOutputs; i++) { - launcher.add_region_requirement(RegionRequirement(outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[i]->region)); - launcher.add_field(i + 1, FID_DATA); +// maybe we should add assert like the original code +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto attrs = acc.get_argument(ATTRS); + + coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; + calc_block_size(num_blks, in_blk_size, input_grad.shape, attrs.axis); + for (int i = 0; i < attrs.splits.size(); i++) { + coord_t out_num_blks; + calc_block_size( + out_num_blks, out_blk_size[i], output_grad.shape, attrs.axis); } - runtime->execute_index_space(ctx, launcher); + return profile(backward_kernel, + profiling, + "Split backward_time = %.2lfms\n", + input_grad.get_float_ptr(), + &output_grad.get_float_ptr(), + out_blk_size, + in_blk_size, + num_blks, + attrs.splits.size()); } void calc_block_size(coord_t &num_blks, coord_t &blk_size, - Domain const &domain, + ArrayShape const &array_shape, int axis) { num_blks = 1; blk_size = 1; - for (int d = 0; d < domain.get_dim(); d++) { + for (int d = 0; d < array_shape.get_dim(); d++) { if (d <= axis) { - blk_size *= (domain.hi()[d] - domain.lo()[d] + 1); + blk_size *= array_shape.at(legion_dim_t(d)); } else { - num_blks *= (domain.hi()[d] - 
domain.lo()[d] + 1); - } - } -} - -void Split::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - Split const *split = (Split *)task->args; - assert(regions.size() == split->numOutputs + 1); - assert(task->regions.size() == split->numOutputs + 1); - Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - float *out_ptr[MAX_NUM_OUTPUTS]; - size_t total_volume = 0; - float const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, in_domain, split->legion_axis); - for (int i = 0; i < split->numOutputs; i++) { - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[i + 1].region.get_index_space()); - out_ptr[i] = helperGetTensorPointerWO( - regions[i + 1], task->regions[i + 1], FID_DATA, ctx, runtime); - coord_t out_num_blks; - calc_block_size( - out_num_blks, out_blk_size[i], out_domain, split->legion_axis); - assert(out_num_blks == num_blks); - for (int j = 0; j < out_domain.get_dim(); j++) { - if (j != split->legion_axis) { - assert(out_domain.hi()[j] == in_domain.hi()[j]); - assert(out_domain.lo()[j] == in_domain.lo()[j]); - } + num_blks *= array_shape.at(legion_dim_t(d)) + 1 } - total_volume += out_domain.get_volume(); - } - assert(total_volume == in_domain.get_volume()); - - forward_kernel_wrapper( - out_ptr, in_ptr, out_blk_size, in_blk_size, num_blks, split->numOutputs); -} - -void Split::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - IndexLauncher launcher(SPLIT_BWD_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Split)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, 
- READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - for (int i = 0; i < numOutputs; i++) { - launcher.add_region_requirement(RegionRequirement(outputs[i]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[i]->region_grad)); - launcher.add_field(i + 1, FID_DATA); } - runtime->execute_index_space(ctx, launcher); } -void Split::backward_task(Task const *task, +static void backward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - Split const *split = (Split *)task->args; - assert(regions.size() == split->numOutputs + 1); - assert(task->regions.size() == split->numOutputs + 1); - Domain in_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - float const *out_grad_ptr[MAX_NUM_OUTPUTS]; - size_t total_volume = 0; - float *in_grad_ptr = helperGetTensorPointerRW( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, in_grad_domain, split->legion_axis); - for (int i = 0; i < split->numOutputs; i++) { - Domain out_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[i + 1].region.get_index_space()); - out_grad_ptr[i] = helperGetTensorPointerRO( - regions[i + 1], task->regions[i + 1], FID_DATA, ctx, runtime); - coord_t out_num_blks; - calc_block_size( - out_num_blks, out_blk_size[i], out_grad_domain, split->legion_axis); - assert(out_num_blks == num_blks); - for (int j = 0; j < out_grad_domain.get_dim(); j++) { - if (j != split->legion_axis) { - assert(out_grad_domain.hi()[j] == in_grad_domain.hi()[j]); - assert(out_grad_domain.lo()[j] == in_grad_domain.lo()[j]); - } - } - total_volume += out_grad_domain.get_volume(); - } - assert(total_volume == in_grad_domain.get_volume()); + TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); +} + +CostMetrics measure_operator_cost(SimEnvFactory const 
&sim_factory, + SplitAttrs const &attrs, + InputParallelTensorDesc const &input, + ProfilingSettings const &settings, + MachineView const &machine_view) { + auto env = sim_factory.new_environment(); + + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + + SimTaskBinding fwd_binding; + fwd_binding.bind(INPUT, input.shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind_arg(PROFILING, settings); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - backward_kernel_wrapper(in_grad_ptr, - out_grad_ptr, - out_blk_size, - in_blk_size, - num_blks, - split->numOutputs); + auto fwd_accessor = env.get_fwd_accessor(SPLIT_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(SPLIT_BWD_TASK_ID, bwd_binding); + + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); } -bool Split::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_output[MAX_NUM_OUTPUTS], sub_input; - for (int i = 0; i < numOutputs; i++) { - if (!outputs[i]->get_sub_tensor(mv, sub_output[i])) { - return false; - } - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - Domain in_domain = sub_input.get_domain(); - sim->free_all(); - float *output_ptr[MAX_NUM_OUTPUTS]; - size_t total_volume = 0; - float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, in_domain, legion_axis); - for (int i = 0; i < numOutputs; i++) { - Domain out_domain = sub_output[i].get_domain(); - output_ptr[i] = - (float *)sim->allocate(sub_output[i].get_volume(), DT_FLOAT); - coord_t 
out_num_blks; - calc_block_size(out_num_blks, out_blk_size[i], out_domain, legion_axis); - assert(out_num_blks == num_blks); - for (int j = 0; j < out_domain.get_dim(); j++) { - if (j != legion_axis) { - assert(out_domain.hi()[j] == in_domain.hi()[j]); - assert(out_domain.lo()[j] == in_domain.lo()[j]); - } - } - total_volume += out_domain.get_volume(); - } - assert(total_volume == in_domain.get_volume()); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FWD); - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper( - output_ptr, input_ptr, out_blk_size, in_blk_size, num_blks, numOutputs); - }; - // Assume backward has the same cost as forward - backward = forward; + fwd.add_arg_slot(PROFILING); - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - if (sim->computationMode == COMP_MODE_TRAINING) { - printf("[Measure Split] name(%s) num_elements(%zu) forward_time(%.4lf) " - "backward_time(%.4lf)\n", - name, - sub_input.get_volume(), - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Split] name(%s) num_elements(%zu) forward_time(%.4lf)\n", - name, - sub_input.get_volume(), - cost_metrics.forward_time); - } - return true; + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task); } -}; // namespace FlexFlow +template <> +void register_task() { + OpTaskSignature bwd = + infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); -namespace std { -size_t hash::operator()( - FlexFlow::SplitParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.splits.size()); - for (int n : params.splits) { - hash_combine(key, n); - } - hash_combine(key, params.legion_axis); - return key; + register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task); } -}; // namespace std + +}; // namespace FlexFlow diff --git 
a/lib/runtime/src/ops/split.h b/lib/runtime/src/ops/split.h index d115c149b8..d63212e836 100644 --- a/lib/runtime/src/ops/split.h +++ b/lib/runtime/src/ops/split.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_SPLIT_H #include "op-attrs/ops/split.h" -#include "op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(SplitAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view);