From 5ee6acd093b97d6768e8078a3a830725327a783e Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 6 Sep 2023 08:07:04 +0000 Subject: [PATCH 01/16] start to do the reshape operator --- deps/fmt | 2 +- lib/kernels/include/kernels/reshape_kernels.h | 17 +- lib/runtime/src/ops/replicate.h | 4 +- lib/runtime/src/ops/reshape.cc | 850 ++++++++++-------- lib/runtime/src/ops/reshape.h | 4 +- lib/runtime/src/tasks.h | 2 +- 6 files changed, 473 insertions(+), 406 deletions(-) diff --git a/deps/fmt b/deps/fmt index f5e54359df..a33701196a 160000 --- a/deps/fmt +++ b/deps/fmt @@ -1 +1 @@ -Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 +Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50 diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 7cb30254f6..0ce07ae88b 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,27 +1,32 @@ #ifndef _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H +#include "attention_kernels.h" +#include "datatype_dispatch.h" #include "kernels/accessor.h" #include "kernels/device.h" +#include "utils/required_core.h" namespace FlexFlow { -class ReshapePerDeviceState : public PerDeviceOpState { -public: - ReshapePerDeviceState(FFHandler handler); - DataType data_type; +struct ReshapePerDeviceState { + req data_type; }; +FF_VISITABLE_STRUCT_NO_EQ(ReshapePerDeviceState, data_type); + +ReshapePerDeviceState init_kernel(DataType data_type); + namespace Kernels { namespace Reshape { void forward_kernel(ffStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const & meta, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const & meta, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output); diff --git 
a/lib/runtime/src/ops/replicate.h b/lib/runtime/src/ops/replicate.h index fd5ffd9ef9..083998414e 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/runtime/src/ops/replicate.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_REPLICATE_H #include "op-attrs/ops/replicate.h" -#include "op_task_invocation.h" +#include "task_spec/op_task_invocation.h" #include "sim_environment.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReplicateAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReplicateAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 71fb10bc9c..45c69d210d 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -46,418 +46,480 @@ bool ReshapeParams::is_valid(ParallelTensorShape const &input) const { return input.is_valid(); } -Tensor FFModel::reshape(const Tensor input, - std::vector const &shape, - char const *name) { - Layer *reshape = new Layer(this, - OP_RESHAPE, - DT_FLOAT, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input); - int dims[MAX_TENSOR_DIM]; - int numdim = shape.size(); - for (int i = 0; i < numdim; i++) { - assert(shape[i] > 0); - dims[i] = shape[i]; - } - reshape->outputs[0] = create_tensor( - numdim, dims, input->data_type, reshape, 0, true /*create_grad*/); - reshape->add_int_vector_property("shape", shape); - layers.push_back(reshape); - return reshape->outputs[0]; -} +enum slots {INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; -Op *Reshape::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - std::vector shape; - layer->get_int_vector_property("shape", shape); - return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); -} -Reshape::Reshape(FFModel &model, - LayerID const &_layer_guid, - 
const ParallelTensor input, - std::vector const &_shape, - char const *name) - : Op(model, - OP_RESHAPE, - input->data_type, - name, - 1 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input) { - layer_guid = _layer_guid; - shape_length = _shape.size(); - assert(shape_length <= MAX_TENSOR_DIM); - for (int i = 0; i < shape_length; i++) { - shape_array[i] = _shape[i]; - } - numOutputs = 1; - numWeights = 0; - int num_replica_dims = 0; - for (int i = 0; i < input->num_dims; i++) { - if (input->dims[i].is_replica_dim) { - num_replica_dims++; - } - } - // assert that all replica dims are leading dims - for (int i = 0; i < num_replica_dims; i++) { - assert(input->dims[input->num_dims - 1 - i].is_replica_dim); - } - int numdim = (int)_shape.size(); - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i].size = _shape[numdim - 1 - i]; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = false; - } - // copy all replica dims - for (int i = 0; i < num_replica_dims; i++) { - dims[i + numdim] = input->dims[input->num_dims - 1 - i]; - } - numdim += num_replica_dims; - for (int i = num_replica_dims; i < numdim && i < input->num_dims; i++) { - if (dims[numdim - 1 - i].size != - input->dims[input->num_dims - 1 - i].size) { - break; - } - dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - numdim, dims, input->data_type, this); - assert(outputs[0]->get_volume() == inputs[0]->get_volume()); -} +OpTaskInvocation init(ReshapeAttrs const & attrs) { + OpTaskBinding binding; -Reshape::Reshape(FFModel &model, - ReshapeParams const ¶ms, - const ParallelTensor input, - char const *name) - : Reshape(model, params.layer_guid, input, params.shape, name) {} - -void Reshape::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime 
= ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(RESHAPE_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Reshape)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); -PerDeviceOpState *Reshape::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - Reshape const *reshape = (Reshape *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - ReshapeMeta *m = new ReshapeMeta(handle); - m->data_type = reshape->outputs[0]->data_type; - return m; + return {RESHAPE_INIT_TASK_ID, binding}; } -void Reshape::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(RESHAPE_FWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} 
+OpTaskInvocation forward(ReshapeAttrs const & attrs) { + OpTaskBinding binding; -void Reshape::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Reshape* reshape = (const Reshape*) task->args; - ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); - Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - assert(in_domain.get_volume() == out_domain.get_volume()); - - if (m->data_type == DT_FLOAT) { - float const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else if (m->data_type == DT_DOUBLE) { - double const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - double *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else if (m->data_type == DT_INT32) { - int32_t const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int32_t *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else if (m->data_type == DT_INT64) { - int64_t const *in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int64_t *out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); - } else { - assert(false && "Unsupported data type in Reshape forward"); 
- } -} + binding.bind(PER_DEVICE_STATE, per_device_op_state()); + binding.bind(PROFILING, profiling_settings()); -void Reshape::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); - IndexLauncher launcher(RESHAPE_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - // regions[3](I/O): input0_grad - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); + return {RESHAPE_FWD_TASK_ID, binding}; } -ReshapeParams Reshape::get_params() const { - std::vector shape_vec; - for (size_t i = 0; i < shape_length; i++) { - shape_vec.push_back(shape_array[i]); - } - ReshapeParams params; - params.shape = shape_vec; - params.layer_guid = this->layer_guid; - return params; -} +OpTaskInvocation backward(ReshapeAttrs const & attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); -void Reshape::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Reshape* reshape = (const Reshape*) task->args; - ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); - Domain out_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - Domain in_grad_domain = runtime->get_index_space_domain( - ctx, 
task->regions[1].region.get_index_space()); - assert(in_grad_domain.get_volume() == out_grad_domain.get_volume()); - - if (m->data_type == DT_FLOAT) { - float const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else if (m->data_type == DT_DOUBLE) { - double const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - double *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else if (m->data_type == DT_INT32) { - int32_t const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int32_t *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else if (m->data_type == DT_INT64) { - int64_t const *out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - int64_t *in_grad_ptr = helperGetTensorPointerRW( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper( - in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); - } else { - assert(false && "Unsupported data type in Reshape backward"); - } + return {RESHAPE_BWD_TASK_ID, binding}; } -bool Reshape::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_input, sub_output; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - - sim->free_all(); - float *input_ptr = (float 
*)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_ptr != NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - assert(sub_output.get_volume() == sub_input.get_volume()); - size_t num_elements = sub_input.get_volume(); - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(input_ptr, output_ptr, num_elements); - }; - if (sim->computationMode == COMP_MODE_TRAINING) { - float *input_grad_ptr = - (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_grad_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *output_grad_ptr = - (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_grad_ptr != NULL); - cost_metrics.outputs_memory += - cost_metrics.total_mem_diff_from(sim->offset); - - backward = [&] { - backward_kernel_wrapper(input_grad_ptr, output_grad_ptr, num_elements); - }; - } - - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - printf( - "[Measure Reshape] name(%s) forward_time(%.4lf) backward_time(%.4lf)\n", - name, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Reshape] name(%s) forward_time(%.4lf)\n", - name, - cost_metrics.forward_time); - } - return true; -} -void Reshape::serialize(Legion::Serializer &sez) const { - sez.serialize(this->shape_length); - for (size_t i = 0; i < this->shape_length; i++) { - sez.serialize(this->shape_array[i]); - } - sez.serialize(this->layer_guid.id); -} -using PCG::Node; - -Node Reshape::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 1); - size_t shape_length; - std::vector shape; - 
dez.deserialize(shape_length); - for (size_t i = 0; i < shape_length; i++) { - int value; - dez.deserialize(value); - shape.push_back(value); - } - size_t id; - dez.deserialize(id); - LayerID layer_guid(id); - - ReshapeParams params; - params.shape = shape; - params.layer_guid = layer_guid; - return ff.get_or_create_node(inputs[0], params); +template <> +void register_task() { + OpTaskSignature init(OpTaskType::INIT); + + init.add_input_slots(INPUT); + init.add_output_slots(OUTPUT); + + register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); } -Op *Reshape::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - assert(num_inputs == 1); - std::vector shape; - for (size_t i = 0; i < this->shape_length; i++) { - shape.push_back(shape_array[i]); - } - return new Reshape(ff, this->layer_guid, inputs[0], shape, this->name); +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_arg_slot(PROFILING); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + + register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task); } -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::ReshapeParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.shape.size()); - for (int n : params.shape) { - hash_combine(key, n); - } - hash_combine(key, params.layer_guid.id); - return key; +template <> +void register_task() { + OpTaskSignature bwd = infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); + + register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); } + +// Tensor FFModel::reshape(const Tensor input, +// std::vector const &shape, +// char const *name) { +// Layer *reshape = new Layer(this, +// OP_RESHAPE, +// DT_FLOAT, +// name, +// 1 /*inputs*/, +// 0 /*weights*/, +// 1 /*outputs*/, +// input); +// int dims[MAX_TENSOR_DIM]; +// int numdim = shape.size(); +// for (int i = 0; i < numdim; i++) 
{ +// assert(shape[i] > 0); +// dims[i] = shape[i]; +// } +// reshape->outputs[0] = create_tensor( +// numdim, dims, input->data_type, reshape, 0, true /*create_grad*/); +// reshape->add_int_vector_property("shape", shape); +// layers.push_back(reshape); +// return reshape->outputs[0]; +// } + +// Op *Reshape::create_operator_from_layer( +// FFModel &model, +// Layer const *layer, +// std::vector const &inputs) { +// std::vector shape; +// layer->get_int_vector_property("shape", shape); +// return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); +// } + +// Reshape::Reshape(FFModel &model, +// LayerID const &_layer_guid, +// const ParallelTensor input, +// std::vector const &_shape, +// char const *name) +// : Op(model, +// OP_RESHAPE, +// input->data_type, +// name, +// 1 /*inputs*/, +// 0 /*weights*/, +// 1 /*outputs*/, +// input) { +// layer_guid = _layer_guid; +// shape_length = _shape.size(); +// assert(shape_length <= MAX_TENSOR_DIM); +// for (int i = 0; i < shape_length; i++) { +// shape_array[i] = _shape[i]; +// } +// numOutputs = 1; +// numWeights = 0; +// int num_replica_dims = 0; +// for (int i = 0; i < input->num_dims; i++) { +// if (input->dims[i].is_replica_dim) { +// num_replica_dims++; +// } +// } +// // assert that all replica dims are leading dims +// for (int i = 0; i < num_replica_dims; i++) { +// assert(input->dims[input->num_dims - 1 - i].is_replica_dim); +// } +// int numdim = (int)_shape.size(); +// ParallelDim dims[MAX_TENSOR_DIM]; +// for (int i = 0; i < numdim; i++) { +// dims[i].size = _shape[numdim - 1 - i]; +// dims[i].degree = 1; +// dims[i].parallel_idx = -1; +// dims[i].is_replica_dim = false; +// } +// // copy all replica dims +// for (int i = 0; i < num_replica_dims; i++) { +// dims[i + numdim] = input->dims[input->num_dims - 1 - i]; +// } +// numdim += num_replica_dims; +// for (int i = num_replica_dims; i < numdim && i < input->num_dims; i++) { +// if (dims[numdim - 1 - i].size != +// 
input->dims[input->num_dims - 1 - i].size) { +// break; +// } +// dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; +// } +// outputs[0] = model.create_parallel_tensor_legion_ordering( +// numdim, dims, input->data_type, this); +// assert(outputs[0]->get_volume() == inputs[0]->get_volume()); +// } + +// Reshape::Reshape(FFModel &model, +// ReshapeParams const ¶ms, +// const ParallelTensor input, +// char const *name) +// : Reshape(model, params.layer_guid, input, params.shape, name) {} + +// void Reshape::init(FFModel const &ff) { +// assert(check_output_input_weight_same_parallel_is()); +// parallel_is = outputs[0]->parallel_is; +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_init(ff, argmap); +// IndexLauncher launcher(RESHAPE_INIT_TASK_ID, +// parallel_is, +// TaskArgument(this, sizeof(Reshape)), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// inputs[0]->region)); +// launcher.add_field(0, FID_DATA); +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// outputs[0]->region)); +// launcher.add_field(1, FID_DATA); +// FutureMap fm = runtime->execute_index_space(ctx, launcher); +// fm.wait_all_results(); +// set_opmeta_from_futuremap(ff, fm); +// } + +// PerDeviceOpState *Reshape::init_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// Reshape const *reshape = (Reshape *)task->args; +// FFHandler handle = *((FFHandler const *)task->local_args); +// ReshapeMeta *m = new ReshapeMeta(handle); +// m->data_type = reshape->outputs[0]->data_type; +// return m; +// } + +// void Reshape::forward(FFModel const &ff) { +// ArgumentMap argmap; +// Context ctx = 
ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_forward(ff, argmap); +// IndexLauncher launcher(RESHAPE_FWD_TASK_ID, +// parallel_is, +// TaskArgument(NULL, 0), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// inputs[0]->region)); +// launcher.add_field(0, FID_DATA); +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// outputs[0]->region)); +// launcher.add_field(1, FID_DATA); +// runtime->execute_index_space(ctx, launcher); +// } + +// void Reshape::forward_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// assert(regions.size() == 2); +// assert(task->regions.size() == 2); +// // const Reshape* reshape = (const Reshape*) task->args; +// ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); +// Domain in_domain = runtime->get_index_space_domain( +// ctx, task->regions[0].region.get_index_space()); +// Domain out_domain = runtime->get_index_space_domain( +// ctx, task->regions[1].region.get_index_space()); +// assert(in_domain.get_volume() == out_domain.get_volume()); + +// if (m->data_type == DT_FLOAT) { +// float const *in_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// float *out_ptr = helperGetTensorPointerWO( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); +// } else if (m->data_type == DT_DOUBLE) { +// double const *in_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// double *out_ptr = helperGetTensorPointerWO( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// forward_kernel_wrapper(in_ptr, out_ptr, 
in_domain.get_volume()); +// } else if (m->data_type == DT_INT32) { +// int32_t const *in_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// int32_t *out_ptr = helperGetTensorPointerWO( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); +// } else if (m->data_type == DT_INT64) { +// int64_t const *in_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// int64_t *out_ptr = helperGetTensorPointerWO( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); +// } else { +// assert(false && "Unsupported data type in Reshape forward"); +// } +// } + +// void Reshape::backward(FFModel const &ff) { +// ArgumentMap argmap; +// Context ctx = ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_backward(ff, argmap); +// IndexLauncher launcher(RESHAPE_BWD_TASK_ID, +// parallel_is, +// TaskArgument(NULL, 0), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// // regions[0](I): output_grad +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// outputs[0]->region_grad)); +// launcher.add_field(0, FID_DATA); +// // regions[3](I/O): input0_grad +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, +// 0 /*projection id*/, +// READ_WRITE, +// EXCLUSIVE, +// inputs[0]->region_grad)); +// launcher.add_field(1, FID_DATA); +// runtime->execute_index_space(ctx, launcher); +// } + +// ReshapeParams Reshape::get_params() const { +// std::vector shape_vec; +// for (size_t i = 0; i < shape_length; i++) { +// shape_vec.push_back(shape_array[i]); +// } +// ReshapeParams params; +// params.shape = shape_vec; +// params.layer_guid = this->layer_guid; +// return params; 
+// } + +// void Reshape::backward_task(Task const *task, +// std::vector const ®ions, +// Context ctx, +// Runtime *runtime) { +// assert(regions.size() == 2); +// assert(task->regions.size() == 2); +// // const Reshape* reshape = (const Reshape*) task->args; +// ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); +// Domain out_grad_domain = runtime->get_index_space_domain( +// ctx, task->regions[0].region.get_index_space()); +// Domain in_grad_domain = runtime->get_index_space_domain( +// ctx, task->regions[1].region.get_index_space()); +// assert(in_grad_domain.get_volume() == out_grad_domain.get_volume()); + +// if (m->data_type == DT_FLOAT) { +// float const *out_grad_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// float *in_grad_ptr = helperGetTensorPointerRW( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// backward_kernel_wrapper( +// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); +// } else if (m->data_type == DT_DOUBLE) { +// double const *out_grad_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// double *in_grad_ptr = helperGetTensorPointerRW( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// backward_kernel_wrapper( +// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); +// } else if (m->data_type == DT_INT32) { +// int32_t const *out_grad_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// int32_t *in_grad_ptr = helperGetTensorPointerRW( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// backward_kernel_wrapper( +// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); +// } else if (m->data_type == DT_INT64) { +// int64_t const *out_grad_ptr = helperGetTensorPointerRO( +// regions[0], task->regions[0], FID_DATA, ctx, runtime); +// int64_t *in_grad_ptr = helperGetTensorPointerRW( +// regions[1], task->regions[1], FID_DATA, ctx, runtime); +// 
backward_kernel_wrapper( +// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); +// } else { +// assert(false && "Unsupported data type in Reshape backward"); +// } +// } + +// bool Reshape::measure_operator_cost(Simulator *sim, +// MachineView const &mv, +// CostMetrics &cost_metrics) const { +// ParallelTensorBase sub_input, sub_output; +// if (!outputs[0]->get_sub_tensor(mv, sub_output)) { +// return false; +// } +// if (!inputs[0]->get_sub_tensor(mv, sub_input)) { +// return false; +// } + +// sim->free_all(); +// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); +// assert(input_ptr != NULL); +// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + +// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); +// assert(output_ptr != NULL); +// cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + +// assert(sub_output.get_volume() == sub_input.get_volume()); +// size_t num_elements = sub_input.get_volume(); + +// std::function forward, backward; +// forward = [&] { +// forward_kernel_wrapper(input_ptr, output_ptr, num_elements); +// }; +// if (sim->computationMode == COMP_MODE_TRAINING) { +// float *input_grad_ptr = +// (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); +// assert(input_grad_ptr != NULL); +// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + +// float *output_grad_ptr = +// (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); +// assert(output_grad_ptr != NULL); +// cost_metrics.outputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); + +// backward = [&] { +// backward_kernel_wrapper(input_grad_ptr, output_grad_ptr, num_elements); +// }; +// } + +// inner_measure_operator_cost(sim, forward, backward, cost_metrics); + +// if (sim->computationMode == COMP_MODE_TRAINING) { +// printf( +// "[Measure Reshape] name(%s) forward_time(%.4lf) backward_time(%.4lf)\n", +// name, +// 
cost_metrics.forward_time, +// cost_metrics.backward_time); +// } else { +// printf("[Measure Reshape] name(%s) forward_time(%.4lf)\n", +// name, +// cost_metrics.forward_time); +// } +// return true; +// } + +// void Reshape::serialize(Legion::Serializer &sez) const { +// sez.serialize(this->shape_length); +// for (size_t i = 0; i < this->shape_length; i++) { +// sez.serialize(this->shape_array[i]); +// } +// sez.serialize(this->layer_guid.id); +// } + +// using PCG::Node; + +// Node Reshape::deserialize(FFModel &ff, +// Legion::Deserializer &dez, +// ParallelTensor inputs[], +// int num_inputs) { +// assert(num_inputs == 1); +// size_t shape_length; +// std::vector shape; +// dez.deserialize(shape_length); +// for (size_t i = 0; i < shape_length; i++) { +// int value; +// dez.deserialize(value); +// shape.push_back(value); +// } +// size_t id; +// dez.deserialize(id); +// LayerID layer_guid(id); + +// ReshapeParams params; +// params.shape = shape; +// params.layer_guid = layer_guid; +// return ff.get_or_create_node(inputs[0], params); +// } + +// Op *Reshape::materialize(FFModel &ff, +// ParallelTensor inputs[], +// int num_inputs) const { +// assert(num_inputs == 1); +// std::vector shape; +// for (size_t i = 0; i < this->shape_length; i++) { +// shape.push_back(shape_array[i]); +// } +// return new Reshape(ff, this->layer_guid, inputs[0], shape, this->name); +// } + +// }; // namespace FlexFlow + +// namespace std { +// size_t hash::operator()( +// FlexFlow::ReshapeParams const ¶ms) const { +// size_t key = 0; +// hash_combine(key, params.shape.size()); +// for (int n : params.shape) { +// hash_combine(key, n); +// } +// hash_combine(key, params.layer_guid.id); +// return key; +// } + }; // namespace std diff --git a/lib/runtime/src/ops/reshape.h b/lib/runtime/src/ops/reshape.h index 42bbefd9db..85370d9ede 100644 --- a/lib/runtime/src/ops/reshape.h +++ b/lib/runtime/src/ops/reshape.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_RESHAPE_H #include "op-attrs/ops/reshape.h" 
-#include "op_task_invocation.h" +#include "task_spec/op_task_invocation.h" #include "sim_environment.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReshapeAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReshapeAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/tasks.h b/lib/runtime/src/tasks.h index 5c1dcb0698..5d36502f6f 100644 --- a/lib/runtime/src/tasks.h +++ b/lib/runtime/src/tasks.h @@ -73,7 +73,7 @@ enum task_id_t { REDUCE_INIT_TASK_ID, REDUCE_FWD_TASK_ID, REDUCE_BWD_TASK_ID, - RESHAPE_INIT_TASK_ID, + _INIT_TASK_IDRESHAPE, RESHAPE_FWD_TASK_ID, RESHAPE_BWD_TASK_ID, REVERSE_INIT_TASK_ID, From 0eaee690703ec2fcb0fa3526ec080abdf4c5ebd2 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 6 Sep 2023 08:17:12 +0000 Subject: [PATCH 02/16] add init_task,forward_task, forward_task for reshape --- lib/runtime/src/ops/reshape.cc | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 45c69d210d..82b8ae2adc 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -16,6 +16,7 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" #include "legion/legion_utilities.h" +#include "utils/exception.decl.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -75,7 +76,51 @@ OpTaskInvocation backward(ReshapeAttrs const & attrs) { return {RESHAPE_BWD_TASK_ID, binding}; } +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { + NOT_IMPLEMENTED(); +} + +static DeviceSpecific init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + return init_task_impl(acc); +} + +static optional forward_task_impl(TaskArgumentAccessor const &acc) { + 
NOT_IMPLEMENTED(); +} + +static void forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); +} + +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + NOT_IMPLEMENTED(); +} + +static void backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); +} + + +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + ReshapeAttrs const &attrs, + InputParallelTensorDesc const &input, + ProfilingSettings const &settings, + MachineView const &machine_view) { + NOT_IMPLEMENTED(); + } template <> void register_task() { From 15ed89cccb5d0ad9aea4bc87eff5a14c6b1d4ef1 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 6 Sep 2023 08:50:48 +0000 Subject: [PATCH 03/16] reshape version0.1 done --- lib/kernels/include/kernels/reshape_kernels.h | 6 +- lib/runtime/src/ops/replicate.h | 2 +- lib/runtime/src/ops/reshape.cc | 96 ++++++++++++------- lib/runtime/src/ops/reshape.h | 2 +- 4 files changed, 69 insertions(+), 37 deletions(-) diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 0ce07ae88b..ad3451e7ef 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -9,7 +9,7 @@ namespace FlexFlow { -struct ReshapePerDeviceState { +struct ReshapePerDeviceState { req data_type; }; @@ -21,12 +21,12 @@ namespace Kernels { namespace Reshape { void forward_kernel(ffStream_t stream, - ReshapePerDeviceState const & meta, + ReshapePerDeviceState const &meta, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const & meta, + ReshapePerDeviceState const &meta, GenericTensorAccessorW const &input, GenericTensorAccessorR const 
&output); diff --git a/lib/runtime/src/ops/replicate.h b/lib/runtime/src/ops/replicate.h index 083998414e..9880f0991b 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/runtime/src/ops/replicate.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_REPLICATE_H #include "op-attrs/ops/replicate.h" -#include "task_spec/op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 82b8ae2adc..bdb4cbaf8b 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -47,10 +47,9 @@ bool ReshapeParams::is_valid(ParallelTensorShape const &input) const { return input.is_valid(); } -enum slots {INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; +enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; - -OpTaskInvocation init(ReshapeAttrs const & attrs) { +OpTaskInvocation init(ReshapeAttrs const &attrs) { OpTaskBinding binding; binding.bind(INPUT, input_tensor(0)); @@ -59,7 +58,7 @@ OpTaskInvocation init(ReshapeAttrs const & attrs) { return {RESHAPE_INIT_TASK_ID, binding}; } -OpTaskInvocation forward(ReshapeAttrs const & attrs) { +OpTaskInvocation forward(ReshapeAttrs const &attrs) { OpTaskBinding binding; binding.bind(PER_DEVICE_STATE, per_device_op_state()); @@ -70,17 +69,19 @@ OpTaskInvocation forward(ReshapeAttrs const & attrs) { return {RESHAPE_FWD_TASK_ID, binding}; } -OpTaskInvocation backward(ReshapeAttrs const & attrs) { +OpTaskInvocation backward(ReshapeAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); return {RESHAPE_BWD_TASK_ID, binding}; } -static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { NOT_IMPLEMENTED(); } -static DeviceSpecific init_task(Task const *task, +static DeviceSpecific + init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { @@ 
-89,7 +90,19 @@ static DeviceSpecific init_task(Task const *task, } static optional forward_task_impl(TaskArgumentAccessor const &acc) { - NOT_IMPLEMENTED(); + auto per_device_state = + acc.get_argument>(PER_DEVICE_STATE); + Profiling profiling = acc.get_argument(PROFILING); + + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + + return profile(forward_kernel, + profiling, + "[Reshape] forward time = %.2lfms\n", + &per_device_state, + input, + output); } static void forward_task(Task const *task, @@ -101,7 +114,19 @@ static void forward_task(Task const *task, } static optional backward_task_impl(TaskArgumentAccessor const &acc) { - NOT_IMPLEMENTED(); + auto per_device_state = + acc.get_argument>(PER_DEVICE_STATE); + Profiling profiling = acc.get_argument(PROFILING); + + auto input_grad = acc.get_tensor(INPUT); + auto output_grad = acc.get_tensor(OUTPUT); + + return profile(backward_kernel, + profiling, + "[Reshape] backward time = %.2lfms\n", + &per_device_state, + input_grad, + output_grad); } static void backward_task(Task const *task, @@ -112,24 +137,29 @@ static void backward_task(Task const *task, backward_task_impl(acc); } - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReshapeAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - NOT_IMPLEMENTED(); - } + // reshape has no cost + // Note(lamda):if reshape has cost, we can optimize this implementation + + float forward_time = 0.0; + float backward_time = 0.0; + float sync_time = 0.0; + return make_metrics(forward_time, backward_time, sync_time, env); +} template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init(OpTaskType::INIT); - init.add_input_slots(INPUT); - init.add_output_slots(OUTPUT); + init.add_input_slots(INPUT); + init.add_output_slots(OUTPUT); - register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); + 
register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); } template <> @@ -147,7 +177,8 @@ void register_task() { template <> void register_task() { - OpTaskSignature bwd = infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); + OpTaskSignature bwd = + infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); } @@ -182,7 +213,8 @@ void register_task() { // std::vector const &inputs) { // std::vector shape; // layer->get_int_vector_property("shape", shape); -// return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); +// return new Reshape(model, layer->layer_guid, inputs[0], shape, +// layer->name); // } // Reshape::Reshape(FFModel &model, @@ -280,9 +312,9 @@ void register_task() { // } // PerDeviceOpState *Reshape::init_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { +// std::vector const +// ®ions, Context ctx, Runtime *runtime) +// { // Reshape const *reshape = (Reshape *)task->args; // FFHandler handle = *((FFHandler const *)task->local_args); // ReshapeMeta *m = new ReshapeMeta(handle); @@ -461,13 +493,13 @@ void register_task() { // } // sim->free_all(); -// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); -// assert(input_ptr != NULL); -// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), +// DT_FLOAT); assert(input_ptr != NULL); cost_metrics.inputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); -// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); -// assert(output_ptr != NULL); -// cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), +// DT_FLOAT); assert(output_ptr != NULL); cost_metrics.outputs_memory += +// 
cost_metrics.total_mem_diff_from(sim->offset); // assert(sub_output.get_volume() == sub_input.get_volume()); // size_t num_elements = sub_input.get_volume(); @@ -480,7 +512,8 @@ void register_task() { // float *input_grad_ptr = // (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); // assert(input_grad_ptr != NULL); -// cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); +// cost_metrics.inputs_memory += +// cost_metrics.total_mem_diff_from(sim->offset); // float *output_grad_ptr = // (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); @@ -497,9 +530,8 @@ void register_task() { // if (sim->computationMode == COMP_MODE_TRAINING) { // printf( -// "[Measure Reshape] name(%s) forward_time(%.4lf) backward_time(%.4lf)\n", -// name, -// cost_metrics.forward_time, +// "[Measure Reshape] name(%s) forward_time(%.4lf) +// backward_time(%.4lf)\n", name, cost_metrics.forward_time, // cost_metrics.backward_time); // } else { // printf("[Measure Reshape] name(%s) forward_time(%.4lf)\n", @@ -567,4 +599,4 @@ void register_task() { // return key; // } -}; // namespace std +}; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reshape.h b/lib/runtime/src/ops/reshape.h index 85370d9ede..f044e3f057 100644 --- a/lib/runtime/src/ops/reshape.h +++ b/lib/runtime/src/ops/reshape.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RESHAPE_H #include "op-attrs/ops/reshape.h" -#include "task_spec/op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { From 984d6168c9b3a3e9b7a8ee8ade64f2a09cb9790d Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 15:11:02 +0000 Subject: [PATCH 04/16] update the reshape and leave the init_task and measure_operator_task --- lib/kernels/include/kernels/reshape_kernels.h | 11 +- lib/runtime/src/ops/replicate.h | 2 +- lib/runtime/src/ops/reshape.cc | 438 +----------------- lib/runtime/src/tasks.h | 2 +- 4 files changed, 18 insertions(+), 435 deletions(-) diff --git 
a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index ad3451e7ef..3e6fd46a51 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,8 +1,6 @@ #ifndef _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H -#include "attention_kernels.h" -#include "datatype_dispatch.h" #include "kernels/accessor.h" #include "kernels/device.h" #include "utils/required_core.h" @@ -13,20 +11,21 @@ struct ReshapePerDeviceState { req data_type; }; -FF_VISITABLE_STRUCT_NO_EQ(ReshapePerDeviceState, data_type); +FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -ReshapePerDeviceState init_kernel(DataType data_type); namespace Kernels { namespace Reshape { +ReshapePerDeviceState init_kernel(DataType data_type); + void forward_kernel(ffStream_t stream, - ReshapePerDeviceState const &meta, + ReshapePerDeviceState const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const &meta, + ReshapePerDeviceState const $per_device_state, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output); diff --git a/lib/runtime/src/ops/replicate.h b/lib/runtime/src/ops/replicate.h index 9880f0991b..361e107b1b 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/runtime/src/ops/replicate.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_REPLICATE_H #include "op-attrs/ops/replicate.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index bdb4cbaf8b..f8a8f69de9 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -16,7 +16,7 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" #include "legion/legion_utilities.h" -#include "utils/exception.decl.h" +#include 
"utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -52,7 +52,7 @@ enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; OpTaskInvocation init(ReshapeAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); + binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); return {RESHAPE_INIT_TASK_ID, binding}; @@ -61,10 +61,10 @@ OpTaskInvocation init(ReshapeAttrs const &attrs) { OpTaskInvocation forward(ReshapeAttrs const &attrs) { OpTaskBinding binding; - binding.bind(PER_DEVICE_STATE, per_device_op_state()); - binding.bind(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); + binding.bind_arg(PROFILING, profiling_settings()); - binding.bind(INPUT, input_tensor(0)); + binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); return {RESHAPE_FWD_TASK_ID, binding}; } @@ -91,7 +91,7 @@ static DeviceSpecific static optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = - acc.get_argument>(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -100,7 +100,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, "[Reshape] forward time = %.2lfms\n", - &per_device_state, + per_device_state, input, output); } @@ -124,7 +124,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, "[Reshape] backward time = %.2lfms\n", - &per_device_state, + per_device_state, input_grad, output_grad); } @@ -156,8 +156,8 @@ template <> void register_task() { OpTaskSignature init(OpTaskType::INIT); - init.add_input_slots(INPUT); - init.add_output_slots(OUTPUT); + init.add_input_slot(INPUT); + init.add_output_slot(OUTPUT); register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, 
init_task); } @@ -166,7 +166,7 @@ template <> void register_task() { OpTaskSignature fwd(OpTaskType::FWD); - fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_input_slot(INPUT); @@ -183,420 +183,4 @@ void register_task() { register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); } -// Tensor FFModel::reshape(const Tensor input, -// std::vector const &shape, -// char const *name) { -// Layer *reshape = new Layer(this, -// OP_RESHAPE, -// DT_FLOAT, -// name, -// 1 /*inputs*/, -// 0 /*weights*/, -// 1 /*outputs*/, -// input); -// int dims[MAX_TENSOR_DIM]; -// int numdim = shape.size(); -// for (int i = 0; i < numdim; i++) { -// assert(shape[i] > 0); -// dims[i] = shape[i]; -// } -// reshape->outputs[0] = create_tensor( -// numdim, dims, input->data_type, reshape, 0, true /*create_grad*/); -// reshape->add_int_vector_property("shape", shape); -// layers.push_back(reshape); -// return reshape->outputs[0]; -// } - -// Op *Reshape::create_operator_from_layer( -// FFModel &model, -// Layer const *layer, -// std::vector const &inputs) { -// std::vector shape; -// layer->get_int_vector_property("shape", shape); -// return new Reshape(model, layer->layer_guid, inputs[0], shape, -// layer->name); -// } - -// Reshape::Reshape(FFModel &model, -// LayerID const &_layer_guid, -// const ParallelTensor input, -// std::vector const &_shape, -// char const *name) -// : Op(model, -// OP_RESHAPE, -// input->data_type, -// name, -// 1 /*inputs*/, -// 0 /*weights*/, -// 1 /*outputs*/, -// input) { -// layer_guid = _layer_guid; -// shape_length = _shape.size(); -// assert(shape_length <= MAX_TENSOR_DIM); -// for (int i = 0; i < shape_length; i++) { -// shape_array[i] = _shape[i]; -// } -// numOutputs = 1; -// numWeights = 0; -// int num_replica_dims = 0; -// for (int i = 0; i < input->num_dims; i++) { -// if (input->dims[i].is_replica_dim) { -// num_replica_dims++; -// } -// } -// // assert that all replica dims 
are leading dims -// for (int i = 0; i < num_replica_dims; i++) { -// assert(input->dims[input->num_dims - 1 - i].is_replica_dim); -// } -// int numdim = (int)_shape.size(); -// ParallelDim dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdim; i++) { -// dims[i].size = _shape[numdim - 1 - i]; -// dims[i].degree = 1; -// dims[i].parallel_idx = -1; -// dims[i].is_replica_dim = false; -// } -// // copy all replica dims -// for (int i = 0; i < num_replica_dims; i++) { -// dims[i + numdim] = input->dims[input->num_dims - 1 - i]; -// } -// numdim += num_replica_dims; -// for (int i = num_replica_dims; i < numdim && i < input->num_dims; i++) { -// if (dims[numdim - 1 - i].size != -// input->dims[input->num_dims - 1 - i].size) { -// break; -// } -// dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; -// } -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// numdim, dims, input->data_type, this); -// assert(outputs[0]->get_volume() == inputs[0]->get_volume()); -// } - -// Reshape::Reshape(FFModel &model, -// ReshapeParams const ¶ms, -// const ParallelTensor input, -// char const *name) -// : Reshape(model, params.layer_guid, input, params.shape, name) {} - -// void Reshape::init(FFModel const &ff) { -// assert(check_output_input_weight_same_parallel_is()); -// parallel_is = outputs[0]->parallel_is; -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_init(ff, argmap); -// IndexLauncher launcher(RESHAPE_INIT_TASK_ID, -// parallel_is, -// TaskArgument(this, sizeof(Reshape)), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// 
WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(1, FID_DATA); -// FutureMap fm = runtime->execute_index_space(ctx, launcher); -// fm.wait_all_results(); -// set_opmeta_from_futuremap(ff, fm); -// } - -// PerDeviceOpState *Reshape::init_task(Task const *task, -// std::vector const -// ®ions, Context ctx, Runtime *runtime) -// { -// Reshape const *reshape = (Reshape *)task->args; -// FFHandler handle = *((FFHandler const *)task->local_args); -// ReshapeMeta *m = new ReshapeMeta(handle); -// m->data_type = reshape->outputs[0]->data_type; -// return m; -// } - -// void Reshape::forward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_forward(ff, argmap); -// IndexLauncher launcher(RESHAPE_FWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(1, FID_DATA); -// runtime->execute_index_space(ctx, launcher); -// } - -// void Reshape::forward_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { -// assert(regions.size() == 2); -// assert(task->regions.size() == 2); -// // const Reshape* reshape = (const Reshape*) task->args; -// ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); -// Domain in_domain = runtime->get_index_space_domain( -// ctx, task->regions[0].region.get_index_space()); -// Domain out_domain = runtime->get_index_space_domain( -// ctx, task->regions[1].region.get_index_space()); -// assert(in_domain.get_volume() 
== out_domain.get_volume()); - -// if (m->data_type == DT_FLOAT) { -// float const *in_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// float *out_ptr = helperGetTensorPointerWO( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); -// } else if (m->data_type == DT_DOUBLE) { -// double const *in_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// double *out_ptr = helperGetTensorPointerWO( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); -// } else if (m->data_type == DT_INT32) { -// int32_t const *in_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// int32_t *out_ptr = helperGetTensorPointerWO( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); -// } else if (m->data_type == DT_INT64) { -// int64_t const *in_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// int64_t *out_ptr = helperGetTensorPointerWO( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// forward_kernel_wrapper(in_ptr, out_ptr, in_domain.get_volume()); -// } else { -// assert(false && "Unsupported data type in Reshape forward"); -// } -// } - -// void Reshape::backward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_backward(ff, argmap); -// IndexLauncher launcher(RESHAPE_BWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// // regions[0](I): output_grad -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, -// 0 /*projection id*/, -// READ_ONLY, 
-// EXCLUSIVE, -// outputs[0]->region_grad)); -// launcher.add_field(0, FID_DATA); -// // regions[3](I/O): input0_grad -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[0]->region_grad)); -// launcher.add_field(1, FID_DATA); -// runtime->execute_index_space(ctx, launcher); -// } - -// ReshapeParams Reshape::get_params() const { -// std::vector shape_vec; -// for (size_t i = 0; i < shape_length; i++) { -// shape_vec.push_back(shape_array[i]); -// } -// ReshapeParams params; -// params.shape = shape_vec; -// params.layer_guid = this->layer_guid; -// return params; -// } - -// void Reshape::backward_task(Task const *task, -// std::vector const ®ions, -// Context ctx, -// Runtime *runtime) { -// assert(regions.size() == 2); -// assert(task->regions.size() == 2); -// // const Reshape* reshape = (const Reshape*) task->args; -// ReshapeMeta const *m = *((ReshapeMeta **)task->local_args); -// Domain out_grad_domain = runtime->get_index_space_domain( -// ctx, task->regions[0].region.get_index_space()); -// Domain in_grad_domain = runtime->get_index_space_domain( -// ctx, task->regions[1].region.get_index_space()); -// assert(in_grad_domain.get_volume() == out_grad_domain.get_volume()); - -// if (m->data_type == DT_FLOAT) { -// float const *out_grad_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// float *in_grad_ptr = helperGetTensorPointerRW( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// backward_kernel_wrapper( -// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); -// } else if (m->data_type == DT_DOUBLE) { -// double const *out_grad_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// double *in_grad_ptr = helperGetTensorPointerRW( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// backward_kernel_wrapper( -// in_grad_ptr, out_grad_ptr, 
in_grad_domain.get_volume()); -// } else if (m->data_type == DT_INT32) { -// int32_t const *out_grad_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// int32_t *in_grad_ptr = helperGetTensorPointerRW( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// backward_kernel_wrapper( -// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); -// } else if (m->data_type == DT_INT64) { -// int64_t const *out_grad_ptr = helperGetTensorPointerRO( -// regions[0], task->regions[0], FID_DATA, ctx, runtime); -// int64_t *in_grad_ptr = helperGetTensorPointerRW( -// regions[1], task->regions[1], FID_DATA, ctx, runtime); -// backward_kernel_wrapper( -// in_grad_ptr, out_grad_ptr, in_grad_domain.get_volume()); -// } else { -// assert(false && "Unsupported data type in Reshape backward"); -// } -// } - -// bool Reshape::measure_operator_cost(Simulator *sim, -// MachineView const &mv, -// CostMetrics &cost_metrics) const { -// ParallelTensorBase sub_input, sub_output; -// if (!outputs[0]->get_sub_tensor(mv, sub_output)) { -// return false; -// } -// if (!inputs[0]->get_sub_tensor(mv, sub_input)) { -// return false; -// } - -// sim->free_all(); -// float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), -// DT_FLOAT); assert(input_ptr != NULL); cost_metrics.inputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); - -// float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), -// DT_FLOAT); assert(output_ptr != NULL); cost_metrics.outputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); - -// assert(sub_output.get_volume() == sub_input.get_volume()); -// size_t num_elements = sub_input.get_volume(); - -// std::function forward, backward; -// forward = [&] { -// forward_kernel_wrapper(input_ptr, output_ptr, num_elements); -// }; -// if (sim->computationMode == COMP_MODE_TRAINING) { -// float *input_grad_ptr = -// (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); -// 
assert(input_grad_ptr != NULL); -// cost_metrics.inputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); - -// float *output_grad_ptr = -// (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); -// assert(output_grad_ptr != NULL); -// cost_metrics.outputs_memory += -// cost_metrics.total_mem_diff_from(sim->offset); - -// backward = [&] { -// backward_kernel_wrapper(input_grad_ptr, output_grad_ptr, num_elements); -// }; -// } - -// inner_measure_operator_cost(sim, forward, backward, cost_metrics); - -// if (sim->computationMode == COMP_MODE_TRAINING) { -// printf( -// "[Measure Reshape] name(%s) forward_time(%.4lf) -// backward_time(%.4lf)\n", name, cost_metrics.forward_time, -// cost_metrics.backward_time); -// } else { -// printf("[Measure Reshape] name(%s) forward_time(%.4lf)\n", -// name, -// cost_metrics.forward_time); -// } -// return true; -// } - -// void Reshape::serialize(Legion::Serializer &sez) const { -// sez.serialize(this->shape_length); -// for (size_t i = 0; i < this->shape_length; i++) { -// sez.serialize(this->shape_array[i]); -// } -// sez.serialize(this->layer_guid.id); -// } - -// using PCG::Node; - -// Node Reshape::deserialize(FFModel &ff, -// Legion::Deserializer &dez, -// ParallelTensor inputs[], -// int num_inputs) { -// assert(num_inputs == 1); -// size_t shape_length; -// std::vector shape; -// dez.deserialize(shape_length); -// for (size_t i = 0; i < shape_length; i++) { -// int value; -// dez.deserialize(value); -// shape.push_back(value); -// } -// size_t id; -// dez.deserialize(id); -// LayerID layer_guid(id); - -// ReshapeParams params; -// params.shape = shape; -// params.layer_guid = layer_guid; -// return ff.get_or_create_node(inputs[0], params); -// } - -// Op *Reshape::materialize(FFModel &ff, -// ParallelTensor inputs[], -// int num_inputs) const { -// assert(num_inputs == 1); -// std::vector shape; -// for (size_t i = 0; i < this->shape_length; i++) { -// shape.push_back(shape_array[i]); -// } -// return new 
Reshape(ff, this->layer_guid, inputs[0], shape, this->name); -// } - -// }; // namespace FlexFlow - -// namespace std { -// size_t hash::operator()( -// FlexFlow::ReshapeParams const ¶ms) const { -// size_t key = 0; -// hash_combine(key, params.shape.size()); -// for (int n : params.shape) { -// hash_combine(key, n); -// } -// hash_combine(key, params.layer_guid.id); -// return key; -// } - }; // namespace FlexFlow diff --git a/lib/runtime/src/tasks.h b/lib/runtime/src/tasks.h index 5d36502f6f..5c1dcb0698 100644 --- a/lib/runtime/src/tasks.h +++ b/lib/runtime/src/tasks.h @@ -73,7 +73,7 @@ enum task_id_t { REDUCE_INIT_TASK_ID, REDUCE_FWD_TASK_ID, REDUCE_BWD_TASK_ID, - _INIT_TASK_IDRESHAPE, + RESHAPE_INIT_TASK_ID, RESHAPE_FWD_TASK_ID, RESHAPE_BWD_TASK_ID, REVERSE_INIT_TASK_ID, From 8b0a0270dfd9e7b0404c6e5c022a74d17efdac1c Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 19:25:52 +0000 Subject: [PATCH 05/16] refine the reshape pr --- lib/kernels/include/kernels/reshape_kernels.h | 1 - lib/runtime/src/ops/reshape.cc | 38 +++++++++++++++---- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 3e6fd46a51..57296fb4aa 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -13,7 +13,6 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); - namespace Kernels { namespace Reshape { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index f8a8f69de9..d2b6028f57 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -54,6 +54,7 @@ OpTaskInvocation init(ReshapeAttrs const &attrs) { binding.bind(INPUT, input_parallel_tensor_shape(0)); binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); return {RESHAPE_INIT_TASK_ID, binding}; } @@ -61,7 +62,8 @@ OpTaskInvocation init(ReshapeAttrs const 
&attrs) { OpTaskInvocation forward(ReshapeAttrs const &attrs) { OpTaskBinding binding; - binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); binding.bind(INPUT, input_parallel_tensor_shape(0)); @@ -77,7 +79,12 @@ OpTaskInvocation backward(ReshapeAttrs const &attrs) { static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { - NOT_IMPLEMENTED(); + auto attrs = acc.get_argument(ATTRS); + + DeviceSpecific per_device_state = + acc.create_device_specific( + init_kernel(attrs.shape.data_type)); + return per_device_state; } static DeviceSpecific @@ -91,7 +98,7 @@ static DeviceSpecific static optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -143,12 +150,29 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ProfilingSettings const &settings, MachineView const &machine_view) { - // reshape has no cost // Note(lamda):if reshape has cost, we can optimize this implementation - float forward_time = 0.0; - float backward_time = 0.0; - float sync_time = 0.0; + SimTaskBinding init_binding; + init_binding.bind_arg(ATTRS, attrs); + auto init_accessor = + env.get_init_accessor(RESHAPE_INIT_TASK_ID, init_binding); + auto per_device_state = init_task_impl(init_accessor); + + SimTaskBinding fwd_binding; + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind(INPUT, input_parallel_tensor_shape(0)); + fwd_binding.bind(OUTPUT, output_tensor(0)); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(TOPK_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(TOPK_BWD_TASK_ID, bwd_binding); + + float 
forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); return make_metrics(forward_time, backward_time, sync_time, env); } From 3047c75c384f00a1bc9e7069789cafb496701be8 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 19:27:30 +0000 Subject: [PATCH 06/16] remove the replicate --- lib/runtime/src/ops/replicate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/replicate.h b/lib/runtime/src/ops/replicate.h index 361e107b1b..fd5ffd9ef9 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/runtime/src/ops/replicate.h @@ -20,7 +20,7 @@ OpTaskInvocation backward(ReplicateAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReplicateAttrs const &attrs, - InputParallelTensorDesc const &input, + ParallelTensorShape const &input_shape, ProfilingSettings const &settings, MachineView const &machine_view); From ea161cd63f41761e262ba1a294b7a58b1e2a53dc Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Sun, 24 Sep 2023 21:10:41 +0000 Subject: [PATCH 07/16] fix error in reshape --- lib/runtime/src/ops/reshape.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index d2b6028f57..1c0e4477ef 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -81,8 +81,8 @@ static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); - DeviceSpecific per_device_state = - acc.create_device_specific( + DeviceSpecific per_device_state = + acc.create_device_specific( init_kernel(attrs.shape.data_type)); return per_device_state; } @@ -122,7 +122,7 @@ static void forward_task(Task const *task, static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = - acc.get_argument>(PER_DEVICE_STATE); + 
acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor(INPUT); From 63cc83dd409ef929010c2161dacf29071c5d72d7 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 26 Sep 2023 13:46:17 +0000 Subject: [PATCH 08/16] fix the type error --- lib/kernels/include/kernels/reshape_kernels.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 57296fb4aa..6be4012073 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -24,7 +24,7 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const $per_device_state, + ReshapePerDeviceState const &per_device_state, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output); From 59166071d31707e62664af61d6bca1a588c168f3 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 27 Sep 2023 20:04:17 +0000 Subject: [PATCH 09/16] add init_kernel for reshape --- lib/kernels/src/cuda/reshape_kernels.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index 15ae0c7109..554ab9fa6b 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -19,8 +19,9 @@ namespace FlexFlow { -ReshapePerDeviceState::ReshapePerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) {} +ReshapePerDeviceState init_kernel(DataType data_type) { + return ReshapePerDeviceState{data_type}; +} namespace Kernels { namespace Reshape { From 171f05fbf037f272d06678b773a4694bc0062c8d Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 3 Oct 2023 15:08:46 +0000 Subject: [PATCH 10/16] modify the reshape by fixing the typo error --- lib/runtime/src/ops/reshape.cc | 17 ++---------------- 1 file
changed, 2 insertions(+), 15 deletions(-) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 1c0e4477ef..3e57da3c97 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -38,22 +38,11 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reshape; -/* Params */ -bool operator==(ReshapeParams const &lhs, ReshapeParams const &rhs) { - return lhs.shape == rhs.shape; -} - -bool ReshapeParams::is_valid(ParallelTensorShape const &input) const { - return input.is_valid(); -} - enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; OpTaskInvocation init(ReshapeAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_parallel_tensor_shape(0)); - binding.bind(OUTPUT, output_tensor(0)); binding.bind_arg(ATTRS, attrs); return {RESHAPE_INIT_TASK_ID, binding}; @@ -150,8 +139,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ProfilingSettings const &settings, MachineView const &machine_view) { - // Note(lamda):if reshape has cost, we can optimize this implementation - SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); auto init_accessor = @@ -166,8 +153,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_fwd_accessor(TOPK_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = env.get_bwd_accessor(TOPK_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = env.get_fwd_accessor(RESHAPE_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(RESHAPE_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); From 143d496367d8a81c3167773ab00b3a5ce1f3a23a Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 10 Oct 2023 20:08:21 +0000 Subject: [PATCH 11/16] refine the reshape --- lib/runtime/src/ops/reshape.cc | 12 +++++++----- 1 file changed, 7 
insertions(+), 5 deletions(-) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 3e57da3c97..0cbb919529 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -55,7 +55,7 @@ OpTaskInvocation forward(ReshapeAttrs const &attrs) { per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - binding.bind(INPUT, input_parallel_tensor_shape(0)); + binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); return {RESHAPE_FWD_TASK_ID, binding}; } @@ -146,10 +146,11 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, auto per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); fwd_binding.bind_arg(PROFILING, settings); - fwd_binding.bind(INPUT, input_parallel_tensor_shape(0)); - fwd_binding.bind(OUTPUT, output_tensor(0)); + fwd_binding.bind(INPUT, input.shape); + fwd_binding.bind(OUTPUT, output_shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); @@ -167,8 +168,9 @@ template <> void register_task() { OpTaskSignature init(OpTaskType::INIT); - init.add_input_slot(INPUT); - init.add_output_slot(OUTPUT); + init.add_arg_slot(ATTRS); + + init.add_return_value(PER_DEVICE_STATE); register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); } From 2f798d59ff66a7ea3f4661ea5f9d8e114ca9a0bf Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 17 Oct 2023 21:13:06 +0000 Subject: [PATCH 12/16] reshape --- lib/kernels/src/cuda/reshape_kernels.cu | 4 ++-- lib/runtime/src/ops/reshape.cc | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index 554ab9fa6b..941d431a7c 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -55,14 +55,14 @@ struct 
BackwardKernel { } void forward_kernel(cudaStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { DataTypeDispatch1{}(m->data_type, stream, m, input, output); } void backward_kernel(cudaStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { DataTypeDispatch1{}(m->data_type, stream, m, input, output); diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 0cbb919529..66c6fae0bd 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -16,8 +16,6 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" #include "legion/legion_utilities.h" -#include "utils/exception.h" -#include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names @@ -114,8 +112,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor(INPUT); - auto output_grad = acc.get_tensor(OUTPUT); + auto input_grad = acc.get_tensor(INPUT); + auto output_grad = acc.get_tensor(OUTPUT); return profile(backward_kernel, profiling, From 803f0c041581696be35154a39d62b5e7cd19580b Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 17 Oct 2023 21:25:34 +0000 Subject: [PATCH 13/16] format the code --- lib/kernels/src/cuda/reshape_kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index 941d431a7c..49b6a1f5ba 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -55,7 +55,7 @@ struct BackwardKernel { } void forward_kernel(cudaStream_t stream, - ReshapePerDeviceState const &m, + ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, 
GenericTensorAccessorW const &output) { DataTypeDispatch1{}(m->data_type, stream, m, input, output); From d9db77703614ac05124d9fddf08a81ee67c9935a Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 18 Oct 2023 23:29:21 +0000 Subject: [PATCH 14/16] fix the error --- lib/kernels/src/cuda/reshape_kernels.cu | 5 ++--- lib/runtime/src/ops/reshape.cc | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index 49b6a1f5ba..b52822a7fc 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -42,7 +42,6 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - ReshapePerDeviceState const *m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { float alpha = 1.0f; @@ -58,14 +57,14 @@ void forward_kernel(cudaStream_t stream, ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(m->data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, m, input, output); } void backward_kernel(cudaStream_t stream, ReshapePerDeviceState const &m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(m->data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, m, input, output); } } // namespace Reshape diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 66c6fae0bd..794a36ef62 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -112,8 +112,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor(INPUT); - auto output_grad = acc.get_tensor(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = 
acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, From 2e3c715f8c16aa9de4dae4ef3db8e12964780a8e Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Thu, 19 Oct 2023 19:29:58 +0000 Subject: [PATCH 15/16] format the code --- lib/runtime/src/ops/reshape.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/reshape.cc b/lib/runtime/src/ops/reshape.cc index 794a36ef62..c9dc8cff8d 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/runtime/src/ops/reshape.cc @@ -112,7 +112,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { acc.get_argument(PER_DEVICE_STATE); Profiling profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, From 33ed1638b419b848c1653e13d087c4b0e6a40a6e Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Thu, 19 Oct 2023 21:14:04 +0000 Subject: [PATCH 16/16] format the code --- lib/kernels/src/cuda/reshape_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kernels/src/cuda/reshape_kernels.cu b/lib/kernels/src/cuda/reshape_kernels.cu index b52822a7fc..e935b0d0c2 100644 --- a/lib/kernels/src/cuda/reshape_kernels.cu +++ b/lib/kernels/src/cuda/reshape_kernels.cu @@ -57,14 +57,14 @@ void forward_kernel(cudaStream_t stream, ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(m.data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, input, output); } void backward_kernel(cudaStream_t stream, ReshapePerDeviceState const &m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(m.data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, input, output); } } // namespace Reshape