From dbfc5472072e91d68721f3a100c060b4d337ab78 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Mon, 9 Oct 2023 19:14:49 +0000 Subject: [PATCH 1/9] init draft and leave some comment --- deps/fmt | 2 +- .../include/kernels/layer_norm_kernels.h | 37 +++++++++- lib/kernels/src/cuda/layer_norm_kernels.cu | 37 +++++++++- lib/runtime/src/ops/layer_norm.cc | 69 +++++++++++++++++++ 4 files changed, 140 insertions(+), 5 deletions(-) diff --git a/deps/fmt b/deps/fmt index f5e54359df..a33701196a 160000 --- a/deps/fmt +++ b/deps/fmt @@ -1 +1 @@ -Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 +Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50 diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index a49e1b3483..cd07a6878c 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -2,6 +2,8 @@ #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #include "kernels/device.h" +#include "kernels/allocation.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -23,18 +25,49 @@ class LayerNormPerDeviceState : public PerDeviceOpState { DataType data_type; }; +struct LayerNormPerDeviceState { + bool elementwise_affine; + int64_t effective_batch_size, effective_num_elements; + float eps; + float *mean, *rstd, *ds, *db, *scale, *bias; + DataType data_type; +}; + +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, + elementwise_affine, + effective_batch_size, + effective_num_elements, + eps, + mean, + rstd, + ds, + db, + scale, + bias, + data_type); + + namespace Kernels { namespace LayerNorm { +//todo: this may have some problem. +LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &, + Allocator const &, + bool elementwise_affine, + int64_t effective_batch_size, + int64_t effective_num_elements, + float eps); + + void forward_kernel(ffStream_t stream, - LayerNormPerDeviceState const *m, + LayerNormPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, GenericTensorAccessorW const &gamma, GenericTensorAccessorW const &beta); void backward_kernel(ffStream_t stream, - LayerNormPerDeviceState const *m, + LayerNormPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &input, GenericTensorAccessorW const &input_grad, diff --git a/lib/kernels/src/cuda/layer_norm_kernels.cu b/lib/kernels/src/cuda/layer_norm_kernels.cu index 65d33bec5e..7dc447d511 100644 --- a/lib/kernels/src/cuda/layer_norm_kernels.cu +++ b/lib/kernels/src/cuda/layer_norm_kernels.cu @@ -48,6 +48,39 @@ LayerNormPerDeviceState::LayerNormPerDeviceState( namespace Kernels { namespace LayerNorm { +//todo: this may have some problem. +LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const & handle, + Allocator const & allocator, + bool elementwise_affine_, + int64_t effective_batch_size_, + int64_t effective_num_elements_, + float eps_) { + elementwise_affine = elementwise_affine_; + effective_batch_size = effective_batch_size_; + effective_num_elements = effective_num_elements_; + eps = eps_; + mean = allocator.allocate(sizeof(float) * effective_batch_size); + rstd = allocator.allocate(sizeof(float) * effective_batch_size); + ds= allocator.allocate(sizeof(float) * effective_batch_size); + db = allocator.allocate(sizeof(float) * effective_batch_size); + scale= allocator.allocate(sizeof(float) * effective_batch_size); + bias = allocator.allocate(sizeof(float) * effective_batch_size); + LayerNormPerDeviceState per_device_state = LayerNormPerDeviceState(handle, + elementwise_affine, + effective_batch_size, + effective_num_elements, + eps, + mean, + rstd, + ds, + db, + scale, + bias); + return per_device_state; + + } + + template struct ForwardKernel { void operator()(cudaStream_t stream, @@ -137,7 +170,7 @@ struct BackwardKernel { } void forward_kernel(cudaStream_t stream, - LayerNormPerDeviceState const *m, + LayerNormPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, GenericTensorAccessorW const &gamma, @@ -147,7 +180,7 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - LayerNormPerDeviceState const *m, + LayerNormPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &input, GenericTensorAccessorW const &input_grad, diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc index 98aabb6fc5..2380a4ceb5 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/runtime/src/ops/layer_norm.cc @@ -16,6 +16,8 @@ #include "layer_norm.h" #include "kernels/layer_norm_kernels.h" #include "legion/legion_utilities.h" +#include "op-attrs/ops/layer_norm.h" +#include "utils/exception.decl.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -578,4 +580,71 @@ Op *LayerNorm::materialize(FFModel &ff, ff, params, inputs[0], this->name, true /*allocate_weights*/); } +enum Slots {INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE }; + +OpTaskInvocation init(LayerNormAttrs const & attrs) { + OpTaskBinding b; + + b.bind_arg(HANDLE, ff_handle()); + b.bind_arg(ATTRS, attrs); + + return {LAYERNORM_INIT_TASK_ID, b}; +} + +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { + auto const &attrs = acc.get_argument(ATTRS); + Allocator allocator = acc.get_allocator(); + FFHandler handle = acc.get_argument(HANDLE); + //question: how to get batch_size and effective_num_elements + int64_t effective_batch_size, effective_num_elements; + + DeviceSpecific per_device_state = + acc.create_device_specific( + init_kernel(handle, + allocator, + attrs.elementwise_affine, + effective_batch_size, + effective_num_elements, + attrs.eps) + ); +} + +static DeviceSpecific init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + return init_task_impl(acc); +} + +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + LayerNormAttrs const & attrs, + ParallelTensorShape const &input_shape, + ProfilingSettings const &settings, + MachineView const &machine_view) { + auto env = sim.new_environment(); + ParallelTensorShape output_shape =get_output_shape(attrs, input_shape); + + SimTaskBinding init_binding; + init_binding.bind_arg(HANDLE, ff_handle()); + init_binding.bind_arg(ATTRS, attrs); + + auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding); + + DeviceSpecific = init_task_impl(init_accessor); + +} + +template <> +void register_task() { + OpTaskSignature init(OpTaskType::INIT); + init.add_arg_slot(ATTRS); + init.add_unchecked_arg_slot(HANDLE); + + init.add_return_value(); + + register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init, init_task); +} + + }; // namespace FlexFlow From 51c351b0031d88e025ee73b45c7d5f09a9d17d1d Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Mon, 9 Oct 2023 19:29:06 +0000 Subject: [PATCH 2/9] implement the forward and backward --- lib/runtime/src/ops/layer_norm.cc | 110 +++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc index 2380a4ceb5..30ccd60a7e 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/runtime/src/ops/layer_norm.cc @@ -19,6 +19,7 @@ #include "op-attrs/ops/layer_norm.h" #include "utils/exception.decl.h" #include "utils/hash-utils.h" +#include namespace FlexFlow { @@ -591,6 +592,86 @@ OpTaskInvocation init(LayerNormAttrs const & attrs) { return {LAYERNORM_INIT_TASK_ID, b}; } +OpTaskInvocation forward(LayerNormAttrs const & attrs) { + OpTaskBinding b; + + b.bind(INPUT, input_tensor(0)); + b.bind(OUTPUT, output_tensor(0)); + b.bind(GAMMA, weight_tensor(0));//todo, this may have some problem + b.bind(BETA, weight_tensor(1));//how to get gmmam and beta + b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(PER_DEVICE_STATE, per_device_state()); + + return {LAYERNORM_FWD_TASK_ID, b}; +} + +OpTaskInvocation backward(LayerNormAttrs const & attrs) { + OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); + + return {LAYERNORM_BWD_TASK_ID, b}; +} + + +static optional forward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto gamma = acc.get_tensor(GAMMA); + auto beta = acc.get_tensor(BETA); + + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto &state = acc.get_argument(PER_DEVICE_STATE); + + return profile(forward_kernel, + profiling, + "[LayerNorm] forward time = %.2lfms\n", + state, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr()); +} + +static void forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); +} + + +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto gamma = acc.get_tensor(GAMMA); + + auto input_grad = acc.get_tensor(INPUT_GRAD); + auto gamma_grad = acc.get_tensor(GAMMA_GRAD); + auto beta_grad = acc.get_tensor(BETA_GRAD); + auto output_grad = acc.get_tensor(OUTPUT_GRAD); + + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto &state = acc.get_argument(PER_DEVICE_STATE); + + return profile(backward_kernel, + profiling, + "[LayerNorm] backward time = %.2lfms\n", + state, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr()); +} + +static void backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); +} + static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); @@ -631,7 +712,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding); - DeviceSpecific = init_task_impl(init_accessor); + DeviceSpecific = init_task_impl(init_accessor); + + } @@ -646,5 +729,30 @@ void register_task() { register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init, init_task); } +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + //how to hande gamma and beta, this may have some problem + fwd.add_input_slot(GAMMA); + fwd.add_input_slot(BETA); + + fwd.add_arg_slot(PROFILING); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + + register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd, forward_task); +} + +template <> +void register_task() { + OpTaskSignature bwd = + infer_bwd_signature(get_op_signature(LAYERNORM_FWD_TASK_ID)); + + register_task(LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd, backward_task); +} + + }; // namespace FlexFlow From bbe08aa4dc4ac1d0915d8bc7ed9d4192ff2d8939 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Mon, 9 Oct 2023 19:31:02 +0000 Subject: [PATCH 3/9] layer norm version 0.1 --- lib/runtime/src/ops/layer_norm.cc | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc index 30ccd60a7e..335d542ffc 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/runtime/src/ops/layer_norm.cc @@ -714,8 +714,21 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, DeviceSpecific = init_task_impl(init_accessor); - + SimTaskBinding fwd_binding; + fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(OUTPUT, output_shape); + //TODO how to handle gamma and beta, where are they from + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(LAYERNORM_BWD_TASK_ID, bwd_binding); + + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); } template <> From 1e52a7748d80419d01a40488ceedeab81a607cff Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Mon, 9 Oct 2023 19:33:07 +0000 Subject: [PATCH 4/9] layer norm draft --- .../include/kernels/layer_norm_kernels.h | 26 +- lib/kernels/src/cuda/layer_norm_kernels.cu | 47 +- lib/runtime/src/ops/layer_norm.cc | 695 ++---------------- 3 files changed, 103 insertions(+), 665 deletions(-) diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index cd07a6878c..3a998a74a5 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H -#include "kernels/device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" namespace FlexFlow { @@ -26,11 +26,11 @@ class LayerNormPerDeviceState : public PerDeviceOpState { }; struct LayerNormPerDeviceState { - bool elementwise_affine; - int64_t effective_batch_size, effective_num_elements; - float eps; - float *mean, *rstd, *ds, *db, *scale, *bias; - DataType data_type; + bool elementwise_affine; + int64_t effective_batch_size, effective_num_elements; + float eps; + float *mean, *rstd, *ds, *db, *scale, *bias; + DataType data_type; }; FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, @@ -46,18 +46,16 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); - namespace Kernels { namespace LayerNorm { -//todo: this may have some problem. +// todo: this may have some problem. LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &, - Allocator const &, - bool elementwise_affine, - int64_t effective_batch_size, - int64_t effective_num_elements, - float eps); - + Allocator const &, + bool elementwise_affine, + int64_t effective_batch_size, + int64_t effective_num_elements, + float eps); void forward_kernel(ffStream_t stream, LayerNormPerDeviceState const &m, diff --git a/lib/kernels/src/cuda/layer_norm_kernels.cu b/lib/kernels/src/cuda/layer_norm_kernels.cu index 7dc447d511..f8331abe29 100644 --- a/lib/kernels/src/cuda/layer_norm_kernels.cu +++ b/lib/kernels/src/cuda/layer_norm_kernels.cu @@ -48,38 +48,37 @@ LayerNormPerDeviceState::LayerNormPerDeviceState( namespace Kernels { namespace LayerNorm { -//todo: this may have some problem. -LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const & handle, - Allocator const & allocator, - bool elementwise_affine_, - int64_t effective_batch_size_, - int64_t effective_num_elements_, - float eps_) { +// todo: this may have some problem. +LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, + Allocator const &allocator, + bool elementwise_affine_, + int64_t effective_batch_size_, + int64_t effective_num_elements_, + float eps_) { elementwise_affine = elementwise_affine_; effective_batch_size = effective_batch_size_; effective_num_elements = effective_num_elements_; eps = eps_; mean = allocator.allocate(sizeof(float) * effective_batch_size); rstd = allocator.allocate(sizeof(float) * effective_batch_size); - ds= allocator.allocate(sizeof(float) * effective_batch_size); + ds = allocator.allocate(sizeof(float) * effective_batch_size); db = allocator.allocate(sizeof(float) * effective_batch_size); - scale= allocator.allocate(sizeof(float) * effective_batch_size); + scale = allocator.allocate(sizeof(float) * effective_batch_size); bias = allocator.allocate(sizeof(float) * effective_batch_size); - LayerNormPerDeviceState per_device_state = LayerNormPerDeviceState(handle, - elementwise_affine, - effective_batch_size, - effective_num_elements, - eps, - mean, - rstd, - ds, - db, - scale, - bias); - return per_device_state; - - } - + LayerNormPerDeviceState per_device_state = + LayerNormPerDeviceState(handle, + elementwise_affine, + effective_batch_size, + effective_num_elements, + eps, + mean, + rstd, + ds, + db, + scale, + bias); + return per_device_state; +} template struct ForwardKernel { diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc index 335d542ffc..7757f726e3 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/runtime/src/ops/layer_norm.cc @@ -17,573 +17,15 @@ #include "kernels/layer_norm_kernels.h" #include "legion/legion_utilities.h" #include "op-attrs/ops/layer_norm.h" -#include "utils/exception.decl.h" +#include "utils/exceptions.h" #include "utils/hash-utils.h" #include namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - -using namespace FlexFlow::Kernels::LayerNorm; - -LayerNormParams LayerNorm::get_params() const { - LayerNormParams params; - params.layer_guid = this->layer_guid; - params.axes = this->axes; - params.elementwise_affine = this->elementwise_affine; - params.eps = this->eps; - return params; -} - -Tensor FFModel::layer_norm(const Tensor input, - std::vector const &axes, - bool elementwise_affine, - float eps, - char const *name) { - // FIXME: currently disable elementwise_affine - elementwise_affine = false; - // axes must be the last axes.size() dimensions - for (int i = 0; i < axes.size(); i++) { - bool found = false; - for (int j = 0; j < axes.size(); j++) { - if (axes[j] == input->num_dims - 1 - i) { - found = true; - } - } - if (!found) { - assert(false && "axes must be the last axes.size() dimensions"); - } - } - int num_weights = elementwise_affine ? 2 : 0; - Layer *ln = new Layer(this, - OP_LAYERNORM, - DT_FLOAT, - name, - 1 /*inputs*/, - num_weights, - 1 /*outputs*/, - input); - ln->outputs[0] = create_tensor_legion_ordering(input->num_dims, - input->dims, - input->data_type, - ln, - 0, - true /*create_grad*/); - if (num_weights == 2) { - int M = 1; - for (int i = 0; i < axes.size(); i++) { - M *= input->dims[input->num_dims - 1 - axes[i]]; - } - int dims[1] = {M}; - ln->weights[0] = create_weight_legion_ordering(1, - dims, - input->data_type, - ln, - true /*create_grad*/, - nullptr, - CHOSEN_SYNC_TYPE); - ln->weights[1] = create_weight_legion_ordering(1, - dims, - input->data_type, - ln, - true /*create_grad*/, - nullptr, - CHOSEN_SYNC_TYPE); - } - ln->add_int_property("elementwise_affine", elementwise_affine); - ln->add_int_vector_property("axes", axes); - ln->add_float_property("eps", eps); - layers.push_back(ln); - return ln->outputs[0]; -} - -Op *LayerNorm::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - layer->get_int_property("elementwise_affine", value); - bool elementwise_affine = (bool)value; - std::vector axes; - layer->get_int_vector_property("axes", axes); - float eps; - layer->get_float_property("eps", eps); - return new LayerNorm(model, - layer->layer_guid, - inputs[0], - axes, - elementwise_affine, - eps, - false, // allocate_weights - layer->name); -} - -LayerNorm::LayerNorm(FFModel &model, - LayerNormParams const ¶ms, - ParallelTensor const input, - char const *name, - bool allocate_weights) - : LayerNorm(model, - params.layer_guid, - input, - params.axes, - params.elementwise_affine, - params.eps, - allocate_weights, - name) {} - -LayerNorm::LayerNorm(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor _input, - std::vector const &_axes, - bool _elementwise_affine, - float _eps, - bool allocate_weights, - char const *name) - : Op(model, - OP_LAYERNORM, - _input->data_type, - name, - 1 /*inputs*/, - _elementwise_affine ? 2 : 0 /*weights*/, - 1 /*outputs*/, - _input), - elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes) { - // overwrite layer_guid - layer_guid = _layer_guid; - outputs[0] = model.create_parallel_tensor_legion_ordering( - _input->num_dims, _input->dims, _input->data_type, this); - assert(check_output_input_weight_parallel_dims(allocate_weights)); - ParallelDim output_dims[MAX_TENSOR_DIM]; - int M = 1; - for (int i = 0; i < axes.size(); i++) { - M *= inputs[0]->dims[inputs[0]->num_dims - 1 - axes[i]].size; - } - effective_num_elements = M; - effective_batch_size = inputs[0]->get_volume() / M; - if (numWeights > 0 && allocate_weights) { - int kernel_dims = 2; - assert(false); - // weights[0] = model.create_parallel_weight_legion_ordering( - // kernel_dims, - } else { - // do nothing - } - return; -} - -void LayerNorm::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(LAYERNORM_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(LayerNorm)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(1, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -PerDeviceOpState * - LayerNorm::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - LayerNorm *ln = (LayerNorm *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - LayerNormMeta *meta = new LayerNormMeta(handle, ln); - return meta; -} - -void LayerNorm::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(LAYERNORM_FWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - if (elementwise_affine) { - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); - } - runtime->execute_index_space(ctx, launcher); -} - -/* - regions[0](I): input - regions[1](O): output - regions[2](I/O): gamma - regions[3](I/O): beta -*/ -void LayerNorm::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); - assert(task->regions.size() == regions.size()); - float const *in_ptr = NULL; - float *out_ptr = NULL, *gamma_ptr = NULL, *beta_ptr = NULL; - Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - out_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - assert(in_domain == out_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); - if (m->elementwise_affine) { - assert(regions.size() == 4); - Domain gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); - Domain beta_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - beta_ptr = helperGetTensorPointerRW( - regions[3], task->regions[3], FID_DATA, ctx, runtime); - assert(gamma_domain == beta_domain); - assert(gamma_domain.get_volume() == m->effective_num_elements); - } else { - assert(regions.size() == 2); - } - - forward_kernel_wrapper(m, in_ptr, out_ptr, gamma_ptr, beta_ptr); -} - -void LayerNorm::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); - IndexLauncher launcher(LAYERNORM_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - // regions[1](I): input - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(1, FID_DATA); - // regions[2](I/O): input_grad - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(2, FID_DATA); - if (elementwise_affine) { - // regions[3](I): gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(3, FID_DATA); - // regions[4](I/O): gamma_grad - launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - weights[0]->region_grad)); - launcher.add_field(4, FID_DATA); - // regions[5](I/O): beta_grad - launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - weights[1]->region_grad)); - launcher.add_field(5, FID_DATA); - } - runtime->execute_index_space(ctx, launcher); -} +enum Slots { INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE }; -/* - regions[0](I): output_grad - regions[1](I): input - regions[2](I/O): input_grad - regions[3](I): gamma - regions[4](I/O): gamma_grad - regions[5](I/O): beta_grad - */ -void LayerNorm::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); - assert(task->regions.size() == regions.size()); - float const *in_ptr = NULL, *out_grad_ptr = NULL, *gamma_ptr = NULL; - float *in_grad_ptr = NULL, *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; - Domain out_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); - out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - Domain in_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - in_grad_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); - assert(in_domain == out_grad_domain); - assert(in_domain.get_volume() == - m->effective_num_elements * m->effective_batch_size); - if (m->elementwise_affine) { - assert(regions.size() == 6); - Domain gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRO( - regions[3], task->regions[3], FID_DATA, ctx, runtime); - Domain gamma_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); - gamma_grad_ptr = helperGetTensorPointerRW( - regions[4], task->regions[4], FID_DATA, ctx, runtime); - Domain beta_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); - beta_grad_ptr = helperGetTensorPointerRW( - regions[5], task->regions[5], FID_DATA, ctx, runtime); - assert(gamma_domain == gamma_grad_domain); - assert(gamma_domain == beta_grad_domain); - assert(gamma_domain.get_volume() == m->effective_num_elements); - } else { - assert(regions.size() == 3); - } - - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); -} - -bool LayerNorm::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_output, sub_input; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - LayerNormMeta *m = new LayerNormMeta(sim->handler, this); - - sim->free_all(); - float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(in_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *out_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(out_ptr != NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - // FIXME please add gamma_ptr and beta_ptr after finish the implementation - float *gamma_ptr = NULL, *beta_ptr = NULL; - - bool out_of_memory = - (in_ptr == NULL) || (out_ptr == NULL) || - (((gamma_ptr == NULL) || (beta_ptr == NULL)) && (m->elementwise_affine)); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(m, in_ptr, out_ptr, gamma_ptr, beta_ptr); - }; - - if (sim->computationMode == COMP_MODE_TRAINING) { - float *in_grad_ptr = - (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(in_grad_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - - float *out_grad_ptr = NULL; - out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(out_grad_ptr != NULL); - cost_metrics.outputs_memory += - cost_metrics.total_mem_diff_from(sim->offset); - - float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; - - out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || - (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && - (m->elementwise_affine)); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - backward = [&] { - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); - }; - } - - inner_measure_operator_cost(sim, forward, backward, cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " - "forward_time(%.4lf) backward_time(%.4lf)\n", - name, - sub_output.get_volume(), - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - log_measure.debug("[Measure LayerNorm] name(%s) num_elements(%zu) " - "forward_time(%.4lf)\n", - name, - sub_output.get_volume(), - cost_metrics.forward_time); - } - - return true; -} - -void LayerNorm::serialize(Legion::Serializer &sez) const { - sez.serialize(this->layer_guid.id); - sez.serialize(this->axes.size()); - for (size_t i = 0; i < this->axes.size(); i++) { - sez.serialize(this->axes[i]); - } - sez.serialize(this->elementwise_affine); - sez.serialize(this->eps); -} - -using PCG::Node; -/*static*/ -Node LayerNorm::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 1); - size_t num_axes; - std::vector axes; - bool elementwise_affine; - float eps; - size_t id; - dez.deserialize(id); - LayerID layer_guid(id); - dez.deserialize(num_axes); - for (size_t i = 0; i < num_axes; i++) { - int axis_idx; - dez.deserialize(axis_idx); - axes.push_back(axis_idx); - } - dez.deserialize(elementwise_affine); - dez.deserialize(eps); - - LayerNormParams params; - params.layer_guid = layer_guid; - params.axes = axes; - params.elementwise_affine = elementwise_affine; - params.eps = eps; - return ff.get_or_create_node(inputs[0], params); -} - -Op *LayerNorm::materialize(FFModel &ff, - ParallelTensor inputs[], - int num_inputs) const { - LayerNormParams params = get_params(); - return new LayerNorm( - ff, params, inputs[0], this->name, true /*allocate_weights*/); -} - -enum Slots {INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE }; - -OpTaskInvocation init(LayerNormAttrs const & attrs) { +OpTaskInvocation init(LayerNormAttrs const &attrs) { OpTaskBinding b; b.bind_arg(HANDLE, ff_handle()); @@ -592,43 +34,42 @@ OpTaskInvocation init(LayerNormAttrs const & attrs) { return {LAYERNORM_INIT_TASK_ID, b}; } -OpTaskInvocation forward(LayerNormAttrs const & attrs) { +OpTaskInvocation forward(LayerNormAttrs const &attrs) { OpTaskBinding b; b.bind(INPUT, input_tensor(0)); b.bind(OUTPUT, output_tensor(0)); - b.bind(GAMMA, weight_tensor(0));//todo, this may have some problem - b.bind(BETA, weight_tensor(1));//how to get gmmam and beta + b.bind(GAMMA, weight_tensor(0)); // todo, this may have some problem + b.bind(BETA, weight_tensor(1)); // how to get gmmam and beta b.bind_arg(PROFILING, profiling_settings()); b.bind_arg(PER_DEVICE_STATE, per_device_state()); return {LAYERNORM_FWD_TASK_ID, b}; } -OpTaskInvocation backward(LayerNormAttrs const & attrs) { +OpTaskInvocation backward(LayerNormAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); return {LAYERNORM_BWD_TASK_ID, b}; } - static optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto gamma = acc.get_tensor(GAMMA); - auto beta = acc.get_tensor(BETA); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto &state = acc.get_argument(PER_DEVICE_STATE); - - return profile(forward_kernel, - profiling, - "[LayerNorm] forward time = %.2lfms\n", - state, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - beta.get_float_ptr()); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto gamma = acc.get_tensor(GAMMA); + auto beta = acc.get_tensor(BETA); + + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto &state = acc.get_argument(PER_DEVICE_STATE); + + return profile(forward_kernel, + profiling, + "[LayerNorm] forward time = %.2lfms\n", + state, + input.get_float_ptr(), + output.get_float_ptr(), + gamma.get_float_ptr(), + beta.get_float_ptr()); } static void forward_task(Task const *task, @@ -639,7 +80,6 @@ static void forward_task(Task const *task, forward_task_impl(acc); } - static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto gamma = acc.get_tensor(GAMMA); @@ -653,15 +93,15 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto &state = acc.get_argument(PER_DEVICE_STATE); return profile(backward_kernel, - profiling, - "[LayerNorm] backward time = %.2lfms\n", - state, - output_grad.get_float_ptr(), - input.get_float_ptr(), - input_grad.get_float_ptr(), - gamma.get_float_ptr(), - gamma_grad.get_float_ptr(), - beta_grad.get_float_ptr()); + profiling, + "[LayerNorm] backward time = %.2lfms\n", + state, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr()); } static void backward_task(Task const *task, @@ -672,25 +112,26 @@ static void backward_task(Task const *task, backward_task_impl(acc); } -static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); FFHandler handle = acc.get_argument(HANDLE); - //question: how to get batch_size and effective_num_elements + // question: how to get batch_size and effective_num_elements int64_t effective_batch_size, effective_num_elements; - DeviceSpecific per_device_state = + DeviceSpecific per_device_state = acc.create_device_specific( - init_kernel(handle, - allocator, - attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, - attrs.eps) - ); + init_kernel(handle, + allocator, + attrs.elementwise_affine, + effective_batch_size, + effective_num_elements, + attrs.eps)); } -static DeviceSpecific init_task(Task const *task, +static DeviceSpecific + init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { @@ -699,42 +140,43 @@ static DeviceSpecific init_task(Task const *task, } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, - LayerNormAttrs const & attrs, + LayerNormAttrs const &attrs, ParallelTensorShape const &input_shape, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); - ParallelTensorShape output_shape =get_output_shape(attrs, input_shape); + auto env = sim.new_environment(); + ParallelTensorShape output_shape = get_output_shape(attrs, input_shape); - SimTaskBinding init_binding; - init_binding.bind_arg(HANDLE, ff_handle()); - init_binding.bind_arg(ATTRS, attrs); + SimTaskBinding init_binding; + init_binding.bind_arg(HANDLE, ff_handle()); + init_binding.bind_arg(ATTRS, attrs); - auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding); + auto init_accessor = + env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding); - DeviceSpecific = init_task_impl(init_accessor); + DeviceSpecific = init_task_impl(init_accessor); - SimTaskBinding fwd_binding; - fwd_binding.bind(INPUT, input_shape); - fwd_binding.bind(OUTPUT, output_shape); - //TODO how to handle gamma and beta, where are they from + SimTaskBinding fwd_binding; + fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(OUTPUT, output_shape); + // TODO how to handle gamma and beta, where are they from - SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = env.get_bwd_accessor(LAYERNORM_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(LAYERNORM_BWD_TASK_ID, bwd_binding); - float forward_time = forward_task_impl(fwd_accessor).value(); - float backward_time = backward_task_impl(bwd_accessor).value(); + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); - float sync_time = default_estimate_sync_time(env); - return make_metrics(forward_time, backward_time, sync_time, env); + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); } template <> void register_task() { OpTaskSignature init(OpTaskType::INIT); - init.add_arg_slot(ATTRS); + init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); @@ -748,7 +190,7 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - //how to hande gamma and beta, this may have some problem + // todo how to hande gamma and beta, this may have some problem fwd.add_input_slot(GAMMA); fwd.add_input_slot(BETA); @@ -763,9 +205,8 @@ void register_task() { OpTaskSignature bwd = infer_bwd_signature(get_op_signature(LAYERNORM_FWD_TASK_ID)); - register_task(LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd, backward_task); + register_task( + LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd, backward_task); } - - }; // namespace FlexFlow From ea9bfa5607cbca09722e40aedf228d7b220cacce Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 11 Oct 2023 15:22:38 +0000 Subject: [PATCH 5/9] fix the layer norm and leave beta, gamma --- lib/runtime/src/ops/layer_norm.cc | 35 ++++++++++++++++++++++++++----- lib/runtime/src/ops/layer_norm.h | 2 +- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc index 7757f726e3..129857080e 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/runtime/src/ops/layer_norm.cc @@ -17,10 +17,16 @@ #include "kernels/layer_norm_kernels.h" #include "legion/legion_utilities.h" #include "op-attrs/ops/layer_norm.h" +#include "op-attrs/parallel_tensor_shape.h" #include "utils/exceptions.h" #include "utils/hash-utils.h" #include +using Legion::Context; +using Legion::PhysicalRegion; +using Legion::Runtime; +using Legion::Task; + namespace FlexFlow { enum Slots { INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE }; @@ -28,6 +34,8 @@ enum Slots { INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE }; OpTaskInvocation init(LayerNormAttrs const &attrs) { OpTaskBinding b; + b.bind(INPUT, input_tensor(0)); + b.bind_arg(HANDLE, ff_handle()); b.bind_arg(ATTRS, attrs); @@ -116,9 +124,21 @@ static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); + auto input = acc.get_tensor(INPUT); FFHandler handle = acc.get_argument(HANDLE); + // question: how to get batch_size and effective_num_elements int64_t effective_batch_size, effective_num_elements; + int M = 1; + for (int i = 0; i < attrs.axes.size(); i++) { + M *= input.shape.at(legion_dim_t(attrs.axes[i])); + } + int num_replicas = 1; + for (int i = 0; i < intput.shape.num_dims(); i++) { + num_replicas *= input.shape.at(legion_dim_t(i)); + } + effective_num_elements = M; + effective_batch_size = input.shape.get_volume() / num_replicas / M; DeviceSpecific per_device_state = acc.create_device_specific( @@ -141,15 +161,16 @@ static DeviceSpecific CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, LayerNormAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { auto env = sim.new_environment(); - ParallelTensorShape output_shape = get_output_shape(attrs, input_shape); + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; init_binding.bind_arg(HANDLE, ff_handle()); init_binding.bind_arg(ATTRS, attrs); + init.binding.bind(INPUT, input.shape); auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding); @@ -157,8 +178,11 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, DeviceSpecific = init_task_impl(init_accessor); SimTaskBinding fwd_binding; - fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); + // TODO how to handle gamma and beta, where are they from SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); @@ -176,6 +200,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -191,8 +216,8 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); // todo how to hande gamma and beta, this may have some problem - fwd.add_input_slot(GAMMA); - fwd.add_input_slot(BETA); + fwd.add_weight_slot(GAMMA); + fwd.add_weight_slot(BETA); fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); diff --git a/lib/runtime/src/ops/layer_norm.h b/lib/runtime/src/ops/layer_norm.h index 3cae9e3eb7..a9b00ee4e3 100644 --- a/lib/runtime/src/ops/layer_norm.h +++ b/lib/runtime/src/ops/layer_norm.h @@ -20,7 +20,7 @@ OpTaskInvocation backward(LayerNormAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, LayerNormAttrs const &, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input_shape, ProfilingSettings const &settings, MachineView const &machine_view); From 146095bde97b4b6ad264e1bafafdec2e9dc130b0 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 17 Jan 2024 13:47:27 -0800 Subject: [PATCH 6/9] Finish layer norm --- lib/runtime/src/ops/layer_norm.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc index 129857080e..93fc9899a7 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/runtime/src/ops/layer_norm.cc @@ -64,8 +64,8 @@ OpTaskInvocation backward(LayerNormAttrs const &attrs) { static optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - auto gamma = acc.get_tensor(GAMMA); - auto beta = acc.get_tensor(BETA); + auto gamma = acc.get_tensor(GAMMA); + auto beta = acc.get_tensor(BETA); ProfilingSettings profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); @@ -136,9 +136,8 @@ static DeviceSpecific int num_replicas = 1; for (int i = 0; i < intput.shape.num_dims(); i++) { num_replicas *= input.shape.at(legion_dim_t(i)); - } effective_num_elements = M; - effective_batch_size = input.shape.get_volume() / num_replicas / M; + effective_batch_size = input.shape.get_volume() / M; DeviceSpecific per_device_state = acc.create_device_specific( @@ -184,7 +183,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); // TODO how to handle gamma and beta, where are they from - +fwd_binding.bind(GAMMA, input_shape); +fwd_binding.bind(BETA, input_shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding); From 0de86209c799d70899e34ea6f8fecc78fcad9791 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 7 Feb 2024 08:55:20 -0500 Subject: [PATCH 7/9] refine the layernorm --- deps/fmt | 2 +- lib/runtime/src/ops/layer_norm.cc | 54 +++++++++++++++++++------------ lib/runtime/src/ops/layer_norm.h | 2 +- 3 files changed, 36 insertions(+), 22 deletions(-) diff --git a/deps/fmt b/deps/fmt index a33701196a..f5e54359df 160000 --- a/deps/fmt +++ b/deps/fmt @@ -1 +1 @@ -Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50 +Subproject commit f5e54359df4c26b6230fc61d38aa294581393084 diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc index 93fc9899a7..26dc049d4d 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/runtime/src/ops/layer_norm.cc @@ -29,7 +29,7 @@ using Legion::Task; namespace FlexFlow { -enum Slots { INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE }; +enum Slots { PROFILING, INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE }; OpTaskInvocation init(LayerNormAttrs const &attrs) { OpTaskBinding b; @@ -148,6 +148,7 @@ static DeviceSpecific effective_num_elements, attrs.eps)); } + } static DeviceSpecific init_task(Task const *task, @@ -198,40 +199,53 @@ fwd_binding.bind(BETA, input_shape); } template <> -void register_task() { +OpTaskSignature fwd_signature() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + fwd.add_weight_slot(GAMMA); + fwd.add_weight_slot(BETA); + + fwd.add_arg_slot(PROFILING); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + return fwd; +} + + +template <> +OpTaskSignature bwd_signature() { + OpTaskSignature bwd = infer_bwd_signature(fwd_signature()); + return bwd; +} + +template <> +OpTaskSignature init_signatur() { OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); - - register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init, init_task); + return init; } template <> -void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - // todo how to hande gamma and beta, this may have some problem - fwd.add_weight_slot(GAMMA); - fwd.add_weight_slot(BETA); +void register_task() { - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signatur(), init_task); +} - register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd, forward_task); +template <> +void register_task() { + register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd_signature() , forward_task); } template <> void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(LAYERNORM_FWD_TASK_ID)); - register_task( - LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd, backward_task); + LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd_signatur() , backward_task); +} + } -}; // namespace FlexFlow diff --git a/lib/runtime/src/ops/layer_norm.h b/lib/runtime/src/ops/layer_norm.h index a9b00ee4e3..83e6733bf6 100644 --- a/lib/runtime/src/ops/layer_norm.h +++ b/lib/runtime/src/ops/layer_norm.h @@ -20,7 +20,7 @@ OpTaskInvocation backward(LayerNormAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, LayerNormAttrs const &, - InputParallelTensorDesc const &input_shape, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); From 0cd1ca38b8271deb5a8afe3e2a2f58081fa8eb3c Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 7 Feb 2024 12:59:57 -0800 Subject: [PATCH 8/9] Apply suggestions from code review --- lib/runtime/src/ops/layer_norm.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc index 26dc049d4d..a380e4dc66 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/runtime/src/ops/layer_norm.cc @@ -214,13 +214,13 @@ OpTaskSignature fwd_signature() { template <> -OpTaskSignature bwd_signature() { +OpTaskSignature bwd_signature() { OpTaskSignature bwd = infer_bwd_signature(fwd_signature()); return bwd; } template <> -OpTaskSignature init_signatur() { +OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); @@ -233,7 +233,7 @@ OpTaskSignature init_signatur() { template <> void register_task() { - register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signatur(), init_task); + register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signature(), init_task); } template <> @@ -244,7 +244,7 @@ void register_task() { template <> void register_task() { register_task( - LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd_signatur() , backward_task); + LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd_signature() , backward_task); } } From 8adb275e90e34423a8751d80638f1a23567ef9be Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 7 Feb 2024 18:21:04 -0500 Subject: [PATCH 9/9] fix the typo and format --- lib/runtime/src/ops/layer_norm.cc | 66 +++++++++++++++++++------------ 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/runtime/src/ops/layer_norm.cc index 26dc049d4d..6bc671c249 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/runtime/src/ops/layer_norm.cc @@ -29,7 +29,16 @@ using Legion::Task; namespace FlexFlow { -enum Slots { PROFILING, INPUT, OUTPUT, GAMMA, BETA, PER_DEVICE_STATE, ATTRS, HANDLE }; +enum Slots { + PROFILING, + INPUT, + OUTPUT, + GAMMA, + BETA, + PER_DEVICE_STATE, + ATTRS, + HANDLE +}; OpTaskInvocation init(LayerNormAttrs const &attrs) { OpTaskBinding b; @@ -136,19 +145,19 @@ static DeviceSpecific int num_replicas = 1; for (int i = 0; i < intput.shape.num_dims(); i++) { num_replicas *= input.shape.at(legion_dim_t(i)); - effective_num_elements = M; - effective_batch_size = input.shape.get_volume() / M; - - DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, - attrs.eps)); + effective_num_elements = M; + effective_batch_size = input.shape.get_volume() / M; + + DeviceSpecific per_device_state = + acc.create_device_specific( + init_kernel(handle, + allocator, + attrs.elementwise_affine, + effective_batch_size, + effective_num_elements, + attrs.eps)); + } } - } static DeviceSpecific init_task(Task const *task, @@ -184,8 +193,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); // TODO how to handle gamma and beta, where are they from -fwd_binding.bind(GAMMA, input_shape); -fwd_binding.bind(BETA, input_shape); + fwd_binding.bind(GAMMA, input_shape); + fwd_binding.bind(BETA, input_shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding); @@ -212,15 +221,15 @@ OpTaskSignature fwd_signature() { return fwd; } - template <> -OpTaskSignature bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(fwd_signature()); +OpTaskSignature bwd_signature() { + OpTaskSignature bwd = + infer_bwd_signature(fwd_signature()); return bwd; } template <> -OpTaskSignature init_signatur() { +OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); @@ -233,19 +242,26 @@ OpTaskSignature init_signatur() { template <> void register_task() { - register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signatur(), init_task); + register_task(LAYERNORM_INIT_TASK_ID, + "LayerNorm init", + init_signature(), + init_task); } template <> void register_task() { - register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd_signature() , forward_task); + register_task(LAYERNORM_FWD_TASK_ID, + "LayerNorm forward", + fwd_signature(), + forward_task); } template <> void register_task() { - register_task( - LAYERNORM_BWD_TASK_ID, "LayerNorm backward", bwd_signatur() , backward_task); -} - + register_task(LAYERNORM_BWD_TASK_ID, + "LayerNorm backward", + bwd_signature(), + backward_task); } +} // namespace FlexFlow