From 109c9f4375d318abc04a8d1fee80edc2c916cc9e Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 2 Aug 2023 00:30:50 +0000 Subject: [PATCH 01/13] start to local backend for aggregator --- lib/CMakeLists.txt | 10 +++--- .../src/task_spec/task_argument_accessor.h | 32 +++++++++++++++++++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 57d9edab3d..93dbf50f80 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,8 +1,8 @@ -add_subdirectory(pcg) -add_subdirectory(compiler) -# add_subdirectory(runtime) -add_subdirectory(op-attrs) -add_subdirectory(kernels) +#add_subdirectory(pcg) +#add_subdirectory(compiler) + add_subdirectory(runtime) +#add_subdirectory(op-attrs) +#add_subdirectory(kernels) add_subdirectory(utils) # add_subdirectory(ffi) add_subdirectory(substitutions) \ No newline at end of file diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h index 4a4cf64512..7d323430db 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ b/lib/runtime/src/task_spec/task_argument_accessor.h @@ -70,6 +70,36 @@ region_idx_t get_region_idx(TaskArgumentsFormat const &, parallel_tensor_guid_t const &); DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &); + +struct ITaskArgumentAccessor { +template T const &get_argument(slot_id slot) const; +template optional get_optional_argument(slot_id slot) const; +template std::vector get_variadic_argument(slot_id slot) const; +template get_tensor(slot_id slot) const ; +template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const; +template + std::vector> + get_variadic_tensor(slot_id slot) const; + template +std::vector> + get_variadic_tensor_grad(slot_id slot) const; +}; + +struct ILegionTaskArgumentAccessor: public ITaskArgumentAccessor{ + +private: + Legion::Task const *task; + std::vector const ®ions; + Legion::Context ctx; + Legion::Runtime *runtime; + TaskArgumentsFormat const &args_fmt; +}; + +struct ILocalTaskArgumentAccessor: public ITaskArgumentAccessor{ + +}; + struct TaskArgumentAccessor { TaskArgumentAccessor(Legion::Task const *task, std::vector const ®ions, @@ -168,6 +198,8 @@ struct TaskArgumentAccessor { Legion::Context ctx; Legion::Runtime *runtime; TaskArgumentsFormat const &args_fmt; + //cow_ptr_t const & ITaskArgumentAccesor; + cow_ptr_t const & ptr; }; } // namespace FlexFlow From ad7b7f337b4fe1cb4656a37d4050d8c18acef071 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 2 Aug 2023 02:21:30 +0000 Subject: [PATCH 02/13] add ITaskArgumentAccessor, LocalTaskArgumentAccessor, LegionTaskArgumentAccessor --- lib/runtime/src/sim_environment.cc | 26 ++ lib/runtime/src/sim_environment.h | 11 +- .../src/task_spec/task_argument_accessor.cc | 39 +++ .../src/task_spec/task_argument_accessor.h | 294 ++++++++++++------ 4 files changed, 267 insertions(+), 103 deletions(-) create mode 100644 lib/runtime/src/sim_environment.cc create mode 100644 lib/runtime/src/task_spec/task_argument_accessor.cc diff --git a/lib/runtime/src/sim_environment.cc b/lib/runtime/src/sim_environment.cc new file mode 100644 index 0000000000..a3a14a0b64 --- /dev/null +++ b/lib/runtime/src/sim_environment.cc @@ -0,0 +1,26 @@ +#include "sim_environment.h" + +namespaec FlexFlow { + +void SimTaskBinding::bind(slot_id id, ParallelTensorShape const & shape) { + tensor_shape_bindings.insert(id, shape); +} +void SimTaskBinding::bind(slot_id id, TensorShape const & shape) { + tensor_shape_bindings.insert(id, shape); +} + +void SimTaskBinding::bind(slot_id id, InputVariadicParallelTensorDesc const & desc) { + this->tensor_shape_bindings.insert(id, desc); +} + +void SimTaskBinding::bind_arg(slot_id id, SimArg const & arg) { + arg_bindings.insert(id, arg); +} + +TaskArgumentAccessor SimEnvironment::get_fwd_accessor(task_id_t tid, SimTaskBinding const & sim_task_binding) { + NOT_IMPLEMENTED();//TODO +} + + + +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/runtime/src/sim_environment.h b/lib/runtime/src/sim_environment.h index d08bef653f..7af7a84125 100644 --- a/lib/runtime/src/sim_environment.h +++ b/lib/runtime/src/sim_environment.h @@ -4,9 +4,11 @@ #include "cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" +#include "kernels/profiling.h" #include "op-attrs/parallel_tensor_shape.h" #include "task_spec/op_task_invocation.h" #include "task_spec/task_argument_accessor.h" +#include #include namespace FlexFlow { @@ -23,6 +25,9 @@ struct InputVariadicParallelTensorDesc { IsTrainable trainable; }; +using SimArg = variant; +using SimTensorSpec = variant; + struct SimTaskBinding { void bind(slot_id, ParallelTensorShape const &); void bind_untrainable(slot_id, ParallelTensorShape const &); @@ -33,9 +38,11 @@ struct SimTaskBinding { void bind_untrainable(slot_id, std::vector const &); void bind(slot_id, std::vector const &, IsTrainable); void bind(slot_id, InputVariadicParallelTensorDesc const &); + + template void bind_arg(slot_id id, T const & name); - template - void bind_arg(slot_id, T const &); + std::unordered_map arg_bindings; + std::unordered_map tensor_shape_bindings; }; SimTaskBinding infer_bwd_binding(SimTaskBinding const &); diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc new file mode 100644 index 0000000000..7427347456 --- /dev/null +++ b/lib/runtime/src/task_spec/task_argument_accessor.cc @@ -0,0 +1,39 @@ +#include "task_argument_accessor.h" + +using namespace FlexFlow { + +template T const & LocalTaskArgumentAccessor::get_argument(slot_id slot) const{ + if(slot == PROFILING) { + return get(this->arg_bindings.at(slot)); + } elif (slot == ATTRS) { + return get(this->arg_bindings.at(slot)); + } else { + throw std::runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_argument"); + } +} + +template privilege_mode_to_accessor LocalTaskArgumentAccessor::get_tensor(slot_id slot) const { + SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot); +// NOT_IMPLEMENTED();//TODO, I should convert spec to privilege_mode_to_accessor +//lib/runtime/src/accessor.h +// template <> +// struct privilege_mode_to_accessor_t { +// using type = GenericTensorAccessorW; +// }; + +// template <> +// struct privilege_mode_to_accessor_t { +// using type = GenericTensorAccessorR; +// }; + +// template <> +// struct privilege_mode_to_accessor_t { +// using type = GenericTensorAccessorW; +// }; + +// template +// using privilege_mode_to_accessor = +// typename privilege_mode_to_accessor_t::type; +} + +}//namespace FlexFlow \ No newline at end of file diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h index 7d323430db..16e34dca02 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ b/lib/runtime/src/task_spec/task_argument_accessor.h @@ -72,21 +72,83 @@ DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &); struct ITaskArgumentAccessor { -template T const &get_argument(slot_id slot) const; -template optional get_optional_argument(slot_id slot) const; -template std::vector get_variadic_argument(slot_id slot) const; -template get_tensor(slot_id slot) const ; -template - privilege_mode_to_accessor get_tensor_grad(slot_id slot) const; -template - std::vector> - get_variadic_tensor(slot_id slot) const; - template -std::vector> - get_variadic_tensor_grad(slot_id slot) const; +virtual template T const &get_argument(slot_id slot) const = 0; + + +virtual template + privilege_mode_to_accessor get_tensor(slot_id slot) const = 0; }; -struct ILegionTaskArgumentAccessor: public ITaskArgumentAccessor{ +struct LegionTaskArgumentAccessor: public ITaskArgumentAccessor{ + +public: +virtual template T const &get_argument(slot_id slot) const override; +virtual template privilege_mode_to_accessor get_tensor(slot_id slot) const override; + +// template +// T const &get_argument(slot_id slot) const { +// TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); +// std::type_index actual_type = arg_fmt.type; +// std::type_index requested_type = {typeid(T)}; + +// if (actual_type != requested_type) { +// throw mk_runtime_error( +// "Type mismatch in argument access (\"{}\" != \"{}\")", +// actual_type.name(), +// requested_type.name()); +// } + +// void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; +// Legion::Deserializer dez(start_ptr, arg_fmt.size()); + +// return ff_task_deserialize(dez); +// } + +LegionTaskArgumentAccessor(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime):task(task), regions(regions), ctx(ctx), runtime(runtime){} + +// template privilege_mode_to_accessor +// get_generic_accessor(region_idx_t const &idx) const { +// auto tensor_privs = get_permissions(this->args_fmt, idx); +// if (tensor_privs != PRIV) { +// throw mk_runtime_error( +// "Privilege mismatch while accessing tensor: {} != {}", +// tensor_privs, +// PRIV); +// } + +// return helperGetGenericTensorAccessor( +// get_datatype(this->args_fmt, idx), +// regions[idx.value()], +// task->regions[idx.value()], +// FID_DATA, +// ctx, +// runtime); +// } + +// template +// privilege_mode_to_accessor get_tensor(slot_id slot) const { +// auto argument_format = +// get(this->args_fmt.region_idxs.at(slot)); + +// return this->get_generic_accessor(argument_format); +// } + +// template +// std::vector> +// get_variadic_tensor(slot_id slot) const { +// std::vector> result; + +// auto argument_format = +// get(this->args_fmt.region_idxs.at(slot)); +// for (NonvariadicFormat const &argument : argument_format) { +// result.push_back(this->get_generic_accessor(argument)); +// } + +// return result; +// } private: Legion::Task const *task; @@ -96,110 +158,140 @@ struct ILegionTaskArgumentAccessor: public ITaskArgumentAccessor{ TaskArgumentsFormat const &args_fmt; }; -struct ILocalTaskArgumentAccessor: public ITaskArgumentAccessor{ +struct LocalTaskArgumentAccessor: public ITaskArgumentAccessor{ -}; +public: +virtual template T const &get_argument(slot_id slot) const override; +virtual template privilege_mode_to_accessor get_tensor(slot_id slot) const override; -struct TaskArgumentAccessor { - TaskArgumentAccessor(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); +LocalTaskArgumentAccessor(std::shared_ptr & sim_task_binding):sim_task_binding(sim_task_binding){} - template - T const &get_argument(slot_id slot) const { - TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); - std::type_index actual_type = arg_fmt.type; - std::type_index requested_type = {typeid(T)}; - - if (actual_type != requested_type) { - throw mk_runtime_error( - "Type mismatch in argument access (\"{}\" != \"{}\")", - actual_type.name(), - requested_type.name()); - } - - void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; - Legion::Deserializer dez(start_ptr, arg_fmt.size()); - - return ff_task_deserialize(dez); - } +private: + std::shared_ptr sim_task_binding; - template - optional get_optional_argument(slot_id) const { - NOT_IMPLEMENTED(); - } +}; - template - std::vector get_variadic_argument(slot_id) const { - NOT_IMPLEMENTED(); +struct TaskArgumentAccessor { + // TaskArgumentAccessor(Legion::Task const *task, + // std::vector const ®ions, + // Legion::Context ctx, + // Legion::Runtime *runtime); + template + T const &get_argument(slot_id slot) const { + return this->ptr->get_argument(slot); } - template - privilege_mode_to_accessor - get_generic_accessor(region_idx_t const &idx) const { - auto tensor_privs = get_permissions(this->args_fmt, idx); - if (tensor_privs != PRIV) { - throw mk_runtime_error( - "Privilege mismatch while accessing tensor: {} != {}", - tensor_privs, - PRIV); - } - - return helperGetGenericTensorAccessor( - get_datatype(this->args_fmt, idx), - regions[idx.value()], - task->regions[idx.value()], - FID_DATA, - ctx, - runtime); - } - template + template privilege_mode_to_accessor get_tensor(slot_id slot) const { - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - - return this->get_generic_accessor(argument_format); + return this->ptr->get_tensor(slot); } - template - privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - template - std::vector> - get_variadic_tensor(slot_id slot) const { - std::vector> result; - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - for (NonvariadicFormat const &argument : argument_format) { - result.push_back(this->get_generic_accessor(argument)); - } - - return result; + // template + // T const &get_argument(slot_id slot) const { + // TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); + // std::type_index actual_type = arg_fmt.type; + // std::type_index requested_type = {typeid(T)}; + + // if (actual_type != requested_type) { + // throw mk_runtime_error( + // "Type mismatch in argument access (\"{}\" != \"{}\")", + // actual_type.name(), + // requested_type.name()); + // } + + // void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; + // Legion::Deserializer dez(start_ptr, arg_fmt.size()); + + // return ff_task_deserialize(dez); + // } + + // template + // optional get_optional_argument(slot_id) const { + // NOT_IMPLEMENTED(); + // } + + // template + // std::vector get_variadic_argument(slot_id) const { + // NOT_IMPLEMENTED(); + // } + + // template + // privilege_mode_to_accessor + // get_generic_accessor(region_idx_t const &idx) const { + // auto tensor_privs = get_permissions(this->args_fmt, idx); + // if (tensor_privs != PRIV) { + // throw mk_runtime_error( + // "Privilege mismatch while accessing tensor: {} != {}", + // tensor_privs, + // PRIV); + // } + + // return helperGetGenericTensorAccessor( + // get_datatype(this->args_fmt, idx), + // regions[idx.value()], + // task->regions[idx.value()], + // FID_DATA, + // ctx, + // runtime); + // } + + // template + // privilege_mode_to_accessor get_tensor(slot_id slot) const { + // auto argument_format = + // get(this->args_fmt.region_idxs.at(slot)); + + // return this->get_generic_accessor(argument_format); + // } + + // template + // privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { + // NOT_IMPLEMENTED(); + // } + + // template + // std::vector> + // get_variadic_tensor(slot_id slot) const { + // std::vector> result; + + // auto argument_format = + // get(this->args_fmt.region_idxs.at(slot)); + // for (NonvariadicFormat const &argument : argument_format) { + // result.push_back(this->get_generic_accessor(argument)); + // } + + // return result; + // } + + // template + // std::vector> + // get_variadic_tensor_grad(slot_id slot) const { + // NOT_IMPLEMENTED(); + // } + + // size_t get_device_idx() const { + // NOT_IMPLEMENTED(); + // } + template + static + typename std::enable_if::value, + TaskArgumentAccessor>::type + create(Args &&...args) { + return TaskArgumentAccessor( + std::make_shared(std::forward(args)...)); } - - template - std::vector> - get_variadic_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - - size_t get_device_idx() const { - NOT_IMPLEMENTED(); - } - private: - Legion::Task const *task; - std::vector const ®ions; - Legion::Context ctx; - Legion::Runtime *runtime; - TaskArgumentsFormat const &args_fmt; + // Legion::Task const *task; + // std::vector const ®ions; + // Legion::Context ctx; + // Legion::Runtime *runtime; + // TaskArgumentsFormat const &args_fmt; //cow_ptr_t const & ITaskArgumentAccesor; - cow_ptr_t const & ptr; + //cow_ptr_t const & ptr; + TaskArgumentAccessor(std::shared_ptr & ptr): ptr(ptr) {} + std::shared_ptr ptr; }; } // namespace FlexFlow From f106c9a2af394b63570abd586688c8c01e6d90bd Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Thu, 3 Aug 2023 11:16:14 +0000 Subject: [PATCH 03/13] implement some logic LocalTaskArgumentAccessor::get_tensor --- lib/runtime/src/ops/aggregate.cc | 2 ++ .../src/task_spec/task_argument_accessor.cc | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/lib/runtime/src/ops/aggregate.cc b/lib/runtime/src/ops/aggregate.cc index e9874c877a..c1136f513e 100644 --- a/lib/runtime/src/ops/aggregate.cc +++ b/lib/runtime/src/ops/aggregate.cc @@ -467,6 +467,8 @@ CostMetrics fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(ATTRS, attrs); + auto fwd_accessor = env.get_fwd_accessor(AGGREGATE_FWD_TASK_ID, fwd_binding); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc index 7427347456..c4fb086a52 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.cc +++ b/lib/runtime/src/task_spec/task_argument_accessor.cc @@ -14,6 +14,25 @@ template T const & LocalTaskArgumentAccessor::get_argument(slot_id template privilege_mode_to_accessor LocalTaskArgumentAccessor::get_tensor(slot_id slot) const { SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot); + if(slot == GATE_PREDS) { + InputParallelTensorDesc gate_preds = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); + //TODO, I should convert gate_pred to privilege_mode_to_accessor + NOT_IMPLEMENTED(); + } else if(slot == GATE_ASSIGN) { + InputVariadicParallelTensorDesc gate_assign = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); + //TODO, I should convert gate_assign to privilege_mode_to_accessor + NOT_IMPLEMENTED(); + } else if(slot == EXP_PREDS) { + InputVariadicParallelTensorDesc exp_preds = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); + //TODO, I should convert exp_preds to privilege_mode_to_accessor + NOT_IMPLEMENTED(); + } else if(slot == OUTPUT) { + ParallelTensorShape output_shape = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); + //TODO, I should convert output_shape to privilege_mode_to_accessor + NOT_IMPLEMENTED(); + } else { + throw std::runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor"); + } // NOT_IMPLEMENTED();//TODO, I should convert spec to privilege_mode_to_accessor //lib/runtime/src/accessor.h // template <> From 3e578ccf309ca47962c992372615ea19236debe1 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 8 Aug 2023 09:38:19 +0000 Subject: [PATCH 04/13] add arrayshape --- lib/kernels/src/array_shape.cc | 3 +++ lib/op-attrs/include/op-attrs/parallel_tensor_dims.h | 2 +- lib/runtime/src/task_spec/task_argument_accessor.cc | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 44507c14c4..f7f1f19abc 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -10,4 +10,7 @@ std::size_t ArrayShape::get_volume() const { return product(this->dims); } +ArrayShape::ArrayShape(std::vector const &_dims) + : dims(_dims) {} + } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index d38ba75232..fba1202734 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -25,7 +25,7 @@ struct ParallelTensorDims : public use_visitable_cmp { ParallelDim &at(ff_dim_t const &); iterator begin(); - const_iterator begin() const; + const_iterator begin() const; const_iterator cbegin() const; iterator end(); const_iterator end() const; diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc index c4fb086a52..de2ee3797a 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.cc +++ b/lib/runtime/src/task_spec/task_argument_accessor.cc @@ -16,6 +16,8 @@ template privilege_mode_to_accessor LocalTaskArgumentAc SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot); if(slot == GATE_PREDS) { InputParallelTensorDesc gate_preds = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); + DataType shape = gate_preds.shape; + //use gate_preds.shape to get the ArrayShape //TODO, I should convert gate_pred to privilege_mode_to_accessor NOT_IMPLEMENTED(); } else if(slot == GATE_ASSIGN) { From 6b7d2b4b34029461d2fb84db848e5a65e012c0f7 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 8 Aug 2023 09:44:14 +0000 Subject: [PATCH 05/13] construct ArrayShape in LocalTaskArgumentAccessor::get_tensor --- lib/op-attrs/include/op-attrs/parallel_tensor_dims.h | 1 + lib/op-attrs/src/parallel_tensor_shape.cc | 8 ++++++++ lib/runtime/src/task_spec/task_argument_accessor.cc | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index fba1202734..58eae651da 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -11,6 +11,7 @@ struct ParallelTensorDims : public use_visitable_cmp { size_t get_volume() const; size_t num_dims() const; + std::vector get_dims() const; using iterator = typename FFOrdered::iterator; using const_iterator = typename FFOrdered::const_iterator; diff --git a/lib/op-attrs/src/parallel_tensor_shape.cc b/lib/op-attrs/src/parallel_tensor_shape.cc index 9a36e7d11b..4556f624f2 100644 --- a/lib/op-attrs/src/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/parallel_tensor_shape.cc @@ -16,6 +16,14 @@ static std::vector lift_dims(TensorDims const &dims) { ParallelTensorDims::ParallelTensorDims(TensorDims const &dims) : data(lift_dims(dims)) {} +std::vector ParallelTensorDims::get_dims() const { + std::vector dims; + for (ParallelDim const &d : this->data) { + dims.push_back(d.size); + } + return dims; +} + ParallelTensorShape::ParallelTensorShape(TensorShape const &tensor_shape) : dims(tensor_shape.dims), data_type(tensor_shape.data_type) {} diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc index de2ee3797a..794e130a88 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.cc +++ b/lib/runtime/src/task_spec/task_argument_accessor.cc @@ -17,6 +17,10 @@ template privilege_mode_to_accessor LocalTaskArgumentAc if(slot == GATE_PREDS) { InputParallelTensorDesc gate_preds = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); DataType shape = gate_preds.shape; + ArrayShape = {gate_preds.shape.dims.get_dims()};//gate_preds.shape.dims.get_dims() return std::vector + //TODO: 1)allocate memory for ptr 2)fill ptr + //question: 1) how much memory should I allocate? 2) how to fill ptr? + //use gate_preds.shape to get the ArrayShape //TODO, I should convert gate_pred to privilege_mode_to_accessor NOT_IMPLEMENTED(); From 062a52ab2c7aabbf6905f09273c131be614d9e28 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Tue, 8 Aug 2023 12:30:23 +0000 Subject: [PATCH 06/13] add allocate for LocalTaskArgumentAccessor --- lib/kernels/src/allocation.cc | 13 +++++++ lib/kernels/src/cuda/accessor.cc | 22 +++++++++++ .../include/op-attrs/parallel_tensor_shape.h | 3 +- lib/op-attrs/src/datatype.cc | 23 +++++++++++ lib/op-attrs/src/parallel_tensor_shape.cc | 10 +++++ lib/runtime/src/cuda_allocator.cc | 17 +++++++++ lib/runtime/src/cuda_allocator.h | 19 ++++++++++ lib/runtime/src/realm_allocator.h | 3 +- .../src/task_spec/task_argument_accessor.cc | 38 +++++++++++-------- .../src/task_spec/task_argument_accessor.h | 12 +++++- 10 files changed, 138 insertions(+), 22 deletions(-) create mode 100644 lib/kernels/src/allocation.cc create mode 100644 lib/kernels/src/cuda/accessor.cc create mode 100644 lib/op-attrs/src/datatype.cc create mode 100644 lib/runtime/src/cuda_allocator.cc create mode 100644 lib/runtime/src/cuda_allocator.h diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc new file mode 100644 index 0000000000..ee9366f045 --- /dev/null +++ b/lib/kernels/src/allocation.cc @@ -0,0 +1,13 @@ +#include "kernels/allocation.h" + +namespace FlexFlow { + +void * Allocator::allocate(size_t size) { + return i_allocator->allocate(size); +} + +void Allocator::deallocate(void *ptr) { + i_allocator->deallocate(ptr); +} + +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/kernels/src/cuda/accessor.cc b/lib/kernels/src/cuda/accessor.cc new file mode 100644 index 0000000000..4fe9a8b568 --- /dev/null +++ b/lib/kernels/src/cuda/accessor.cc @@ -0,0 +1,22 @@ +#include "kernels/accessor.h" + +namespace FlexFlow { + +int32_t *get_int32_ptr(GenericTensorAccessorW const & w) { + return static_cast(w.ptr.value()); //Note(lambda):we use static_cast, may have some problem +} + +int64_t *get_int64_ptr(GenericTensorAccessorW const &w) { + return static_cast(w.ptr.value()); +} +float *get_float_ptr(GenericTensorAccessorW const & w) { + return static_cast(w.ptr.value()); +} +double *get_double_ptr(GenericTensorAccessorW const &w) { + return static_cast(w.ptr.value()); +} +half *get_half_ptr(GenericTensorAccessorW const & w) { + return static_cast(w.ptr.value()); +} + +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index fd560352bb..eef0034ad5 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -25,8 +25,7 @@ struct ParallelTensorShape : public use_visitable_cmp { ParallelTensorShape(TensorShape const &); - int num_dims() const; - + int num_dims() const; ParallelDim const &at(ff_dim_t const &) const; ParallelDim &at(ff_dim_t const &); ParallelDim const &operator[](ff_dim_t const &) const; diff --git a/lib/op-attrs/src/datatype.cc b/lib/op-attrs/src/datatype.cc new file mode 100644 index 0000000000..e92563b628 --- /dev/null +++ b/lib/op-attrs/src/datatype.cc @@ -0,0 +1,23 @@ +#include "op-attrs/datatype.h" + +namespace FlexFlow { + +size_t size_of(DataType data_type); + switch (type) { + case DataType::BOOL: + return sizeof(bool); + case DataType::INT32: + return sizeof(int32_t); + case DataType::INT64: + return sizeof(int64_t); + case DataType::HALF: + return sizeof(float) / 2; + case DataType::FLOAT: + return sizeof(float); + case DataType::DOUBLE: + return sizeof(double); + default: + throw mk_runtime_error("Unknown data type"); + } + +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/op-attrs/src/parallel_tensor_shape.cc b/lib/op-attrs/src/parallel_tensor_shape.cc index 4556f624f2..184a695da5 100644 --- a/lib/op-attrs/src/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/parallel_tensor_shape.cc @@ -24,6 +24,12 @@ std::vector ParallelTensorDims::get_dims() const { return dims; } +size_t ParallelTensorDims::get_volume() const { + return product(transform(this->data, [](ParallelDim const &d) -> size_t { + return d.size; + })); +} + ParallelTensorShape::ParallelTensorShape(TensorShape const &tensor_shape) : dims(tensor_shape.dims), data_type(tensor_shape.data_type) {} @@ -31,6 +37,10 @@ int get_num_replica_dims(ParallelTensorShape const &shape) { return count(shape.dims, is_replica_dim); } +TensorShape get_piece_shape(ParallelTensorShape const & parall_tensor_shape) { + return TensorShape(parall_tensor_shape.dims, parall_tensor_shape.data_type); +} + int get_num_replicas(ParallelTensorShape const &shape) { return product( transform(filter(as_vector(shape.dims), is_replica_dim), diff --git a/lib/runtime/src/cuda_allocator.cc b/lib/runtime/src/cuda_allocator.cc new file mode 100644 index 0000000000..41db98926e --- /dev/null +++ b/lib/runtime/src/cuda_allocator.cc @@ -0,0 +1,17 @@ +#include "cuda_allocator.h" + +namespace FlexFlow { + +void * CudaAllocator::allocate(size_t size) { + void *ptr; + check_CUDA(cudaMalloc(&ptr, size)); + return ptr; +} + +void CudaAllocator::deallocate(void *ptr) { + check_CUDA(cudaFree(ptr)); +} + +} // namespace FlexFlow + + diff --git a/lib/runtime/src/cuda_allocator.h b/lib/runtime/src/cuda_allocator.h new file mode 100644 index 0000000000..4eea9b4207 --- /dev/null +++ b/lib/runtime/src/cuda_allocator.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_RUNTIME_CUDA_ALLOCATOR_H +#define _FLEXFLOW_RUNTIME_CUDA_ALLOCATOR_H + +#include "kernels/allocation.h" +#include + +namespace FlexFlow { + +struct CudaAllocator : public IAllocator { + ~CudaAllocator() override; + + void *allocate(size_t) override; + void deallocate(void *) override; + +}; + +} // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/lib/runtime/src/realm_allocator.h b/lib/runtime/src/realm_allocator.h index 210fd8a050..99ba8a8252 100644 --- a/lib/runtime/src/realm_allocator.h +++ b/lib/runtime/src/realm_allocator.h @@ -15,8 +15,7 @@ struct RealmAllocator : public IAllocator { ~RealmAllocator() override; void *allocate(size_t) override; - void deallocate(void *) override; - + void deallocate(void *) override private: Legion::Memory memory; stack_vector instances; diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc index 794e130a88..b777afb637 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.cc +++ b/lib/runtime/src/task_spec/task_argument_accessor.cc @@ -12,32 +12,38 @@ template T const & LocalTaskArgumentAccessor::get_argument(slot_id } } +void * LocalTaskArgumentAccessor::allocate(size_t size) { + void * ptr = local_allocator.allocate(memory_size);//Note: how(when) to free this memory? + void * cpu_ptr = malloc(memory_size); + memset(cpu_ptr, 0, memory_size); + checkCUDA(cudaMemcpy(ptr, cpu_ptr, memory_size, cudaMemcpyHostToDevice)); //fill ptr + free(cpu_ptr); + return ptr; +} + + template privilege_mode_to_accessor LocalTaskArgumentAccessor::get_tensor(slot_id slot) const { SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot); - if(slot == GATE_PREDS) { + if(slot == GATE_PREDS || slot == GATE_ASSIGN ) { InputParallelTensorDesc gate_preds = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); - DataType shape = gate_preds.shape; - ArrayShape = {gate_preds.shape.dims.get_dims()};//gate_preds.shape.dims.get_dims() return std::vector - //TODO: 1)allocate memory for ptr 2)fill ptr - //question: 1) how much memory should I allocate? 2) how to fill ptr? - - //use gate_preds.shape to get the ArrayShape - //TODO, I should convert gate_pred to privilege_mode_to_accessor - NOT_IMPLEMENTED(); - } else if(slot == GATE_ASSIGN) { - InputVariadicParallelTensorDesc gate_assign = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); - //TODO, I should convert gate_assign to privilege_mode_to_accessor - NOT_IMPLEMENTED(); + DataType data_type = gate_preds.shape.data_type; + ArrayShape array_shape = {gate_preds.shape.dims.get_dims()};//gate_preds.shape.dims.get_dims() return std::vector + size_t shape_size =gate_preds.shape.dims.get_volume()* size_of(shape); + void * ptr = allocate(shape_size); + return gate_preds_accessor{shape, array_shape, ptr}; } else if(slot == EXP_PREDS) { InputVariadicParallelTensorDesc exp_preds = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); //TODO, I should convert exp_preds to privilege_mode_to_accessor NOT_IMPLEMENTED(); } else if(slot == OUTPUT) { ParallelTensorShape output_shape = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); - //TODO, I should convert output_shape to privilege_mode_to_accessor - NOT_IMPLEMENTED(); + Datatype data_type = output_shape.data_type; + ArrayShape array_shape = {output_shape.dims.get_dims()};//output_shape.dims.get_dims() return std::vector + size_t shape_size = output_shape.dims.get_volume() * size_of(data_type); + void * ptr = allocate(shape_size); + return {shape, array_shape, ptr}; } else { - throw std::runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor"); + throw mk_runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor"); } // NOT_IMPLEMENTED();//TODO, I should convert spec to privilege_mode_to_accessor //lib/runtime/src/accessor.h diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h index 16e34dca02..0229ccd50b 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ b/lib/runtime/src/task_spec/task_argument_accessor.h @@ -7,6 +7,7 @@ #include "utils/exception.h" #include "utils/stack_map.h" #include "utils/strong_typedef.h" +#include "kernels/allocation.h" #include namespace FlexFlow { @@ -164,11 +165,18 @@ struct LocalTaskArgumentAccessor: public ITaskArgumentAccessor{ virtual template T const &get_argument(slot_id slot) const override; virtual template privilege_mode_to_accessor get_tensor(slot_id slot) const override; -LocalTaskArgumentAccessor(std::shared_ptr & sim_task_binding):sim_task_binding(sim_task_binding){} +LocalTaskArgumentAccessor(std::shared_ptr & sim_task_binding):sim_task_binding(sim_task_binding){ + local_allocator = Allocator::create(); +} + +void * allocate(size_t size); + +void deallocate(void * ptr); private: std::shared_ptr sim_task_binding; - + // CudaAllocator local_allocator; + Allocator local_allocator; }; struct TaskArgumentAccessor { From 7ef9afc38de8081bd77ada0fd20824d6995d577c Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 9 Aug 2023 01:56:52 +0000 Subject: [PATCH 07/13] add get_variadic_tensor method for ITaskArgumentAccessor --- lib/CMakeLists.txt | 4 +- lib/kernels/src/allocation.cc | 2 +- lib/kernels/src/array_shape.cc | 3 +- lib/kernels/src/cuda/accessor.cc | 17 +- .../include/op-attrs/parallel_tensor_dims.h | 4 +- .../include/op-attrs/parallel_tensor_shape.h | 2 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 2 +- lib/op-attrs/src/datatype.cc | 7 +- lib/op-attrs/src/parallel_tensor_shape.cc | 12 +- lib/runtime/src/cuda_allocator.cc | 4 +- lib/runtime/src/cuda_allocator.h | 1 - lib/runtime/src/realm_allocator.h | 4 +- lib/runtime/src/sim_environment.cc | 26 +- lib/runtime/src/sim_environment.h | 9 +- .../src/task_spec/task_argument_accessor.cc | 126 ++++----- .../src/task_spec/task_argument_accessor.h | 239 ++++-------------- 16 files changed, 167 insertions(+), 295 deletions(-) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index e137f0dfc3..ece639cb6a 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,8 +1,8 @@ #add_subdirectory(pcg) #add_subdirectory(compiler) add_subdirectory(runtime) -#add_subdirectory(op-attrs) -#add_subdirectory(kernels) +add_subdirectory(op-attrs) +add_subdirectory(kernels) add_subdirectory(utils) # add_subdirectory(ffi) #add_subdirectory(substitutions) diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index ee9366f045..eb90b0cb69 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -2,7 +2,7 @@ namespace FlexFlow { -void * Allocator::allocate(size_t size) { +void *Allocator::allocate(size_t size) { return i_allocator->allocate(size); } diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index f7f1f19abc..67c3de54dd 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -10,7 +10,6 @@ std::size_t ArrayShape::get_volume() const { return product(this->dims); } -ArrayShape::ArrayShape(std::vector const &_dims) - : dims(_dims) {} +ArrayShape::ArrayShape(std::vector const &_dims) : dims(_dims) {} } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/accessor.cc b/lib/kernels/src/cuda/accessor.cc index 4fe9a8b568..1559c10350 100644 --- a/lib/kernels/src/cuda/accessor.cc +++ b/lib/kernels/src/cuda/accessor.cc @@ -2,21 +2,22 @@ namespace FlexFlow { -int32_t *get_int32_ptr(GenericTensorAccessorW const & w) { - return static_cast(w.ptr.value()); //Note(lambda):we use static_cast, may have some problem +int32_t *get_int32_ptr(GenericTensorAccessorW const &w) { + return static_cast( + w.ptr.value()); // Note(lambda):we use static_cast, may have some problem } int64_t *get_int64_ptr(GenericTensorAccessorW const &w) { - return static_cast(w.ptr.value()); + return static_cast(w.ptr.value()); } -float *get_float_ptr(GenericTensorAccessorW const & w) { - return static_cast(w.ptr.value()); +float *get_float_ptr(GenericTensorAccessorW const &w) { + return static_cast(w.ptr.value()); } double *get_double_ptr(GenericTensorAccessorW const &w) { - return static_cast(w.ptr.value()); + return static_cast(w.ptr.value()); } -half *get_half_ptr(GenericTensorAccessorW const & w) { - return static_cast(w.ptr.value()); +half *get_half_ptr(GenericTensorAccessorW const &w) { + return static_cast(w.ptr.value()); } } // namespace FlexFlow \ No newline at end of file diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index 58eae651da..aeec0c6e01 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -11,7 +11,7 @@ struct ParallelTensorDims : public use_visitable_cmp { size_t get_volume() const; size_t num_dims() const; - std::vector get_dims() const; + std::vector get_dims() const; using iterator = typename FFOrdered::iterator; using const_iterator = typename FFOrdered::const_iterator; @@ -26,7 +26,7 @@ struct ParallelTensorDims : public use_visitable_cmp { ParallelDim &at(ff_dim_t const &); iterator begin(); - const_iterator begin() const; + const_iterator begin() const; const_iterator cbegin() const; iterator end(); const_iterator end() const; diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index eef0034ad5..182b2169c3 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -25,7 +25,7 @@ struct ParallelTensorShape : public use_visitable_cmp { ParallelTensorShape(TensorShape const &); - int num_dims() const; + int num_dims() const; ParallelDim const &at(ff_dim_t const &) const; ParallelDim &at(ff_dim_t const &); ParallelDim const &operator[](ff_dim_t const &) const; diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index fa34860817..ab0f958b42 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -16,7 +16,7 @@ struct TensorShape : public use_visitable_cmp { template TensorShape(Dims const &dims, DataType data_type) - : dims(dims), data_type(data_type) {} + : dims(this->dims), data_type(this->data_type) {} size_t at(ff_dim_t) const; size_t operator[](ff_dim_t) const; diff --git a/lib/op-attrs/src/datatype.cc b/lib/op-attrs/src/datatype.cc index e92563b628..fc47ef47eb 100644 --- a/lib/op-attrs/src/datatype.cc +++ b/lib/op-attrs/src/datatype.cc @@ -2,8 +2,8 @@ namespace FlexFlow { -size_t size_of(DataType data_type); - switch (type) { +size_t size_of(DataType data_type) { + switch (data_type) { case DataType::BOOL: return sizeof(bool); case DataType::INT32: @@ -17,7 +17,8 @@ size_t size_of(DataType data_type); case DataType::DOUBLE: return sizeof(double); default: - throw mk_runtime_error("Unknown data type"); + throw mk_runtime_error("Unknown data type"); } +} } // namespace FlexFlow \ No newline at end of file diff --git a/lib/op-attrs/src/parallel_tensor_shape.cc b/lib/op-attrs/src/parallel_tensor_shape.cc index 184a695da5..0f4121973f 100644 --- a/lib/op-attrs/src/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/parallel_tensor_shape.cc @@ -25,9 +25,13 @@ std::vector ParallelTensorDims::get_dims() const { } size_t ParallelTensorDims::get_volume() const { - return product(transform(this->data, [](ParallelDim const &d) -> size_t { - return d.size; - })); + + // this function can use contains.h to optimize the code + size_t volume = 1; + for (ParallelDim const &d : this->data) { + volume *= d.size; + } + return volume; } ParallelTensorShape::ParallelTensorShape(TensorShape const &tensor_shape) @@ -37,7 +41,7 @@ int get_num_replica_dims(ParallelTensorShape const &shape) { return count(shape.dims, is_replica_dim); } -TensorShape get_piece_shape(ParallelTensorShape const & parall_tensor_shape) { +TensorShape get_piece_shape(ParallelTensorShape const ¶ll_tensor_shape) { return TensorShape(parall_tensor_shape.dims, parall_tensor_shape.data_type); } diff --git a/lib/runtime/src/cuda_allocator.cc b/lib/runtime/src/cuda_allocator.cc index 41db98926e..5a4529c8ed 100644 --- a/lib/runtime/src/cuda_allocator.cc +++ b/lib/runtime/src/cuda_allocator.cc @@ -2,7 +2,7 @@ namespace FlexFlow { -void * CudaAllocator::allocate(size_t size) { +void *CudaAllocator::allocate(size_t size) { void *ptr; check_CUDA(cudaMalloc(&ptr, size)); return ptr; @@ -13,5 +13,3 @@ void CudaAllocator::deallocate(void *ptr) { } } // namespace FlexFlow - - diff --git a/lib/runtime/src/cuda_allocator.h b/lib/runtime/src/cuda_allocator.h index 4eea9b4207..cb3c8e1c19 100644 --- a/lib/runtime/src/cuda_allocator.h +++ b/lib/runtime/src/cuda_allocator.h @@ -11,7 +11,6 @@ struct CudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; - }; } // namespace FlexFlow diff --git a/lib/runtime/src/realm_allocator.h b/lib/runtime/src/realm_allocator.h index 99ba8a8252..95957b643d 100644 --- a/lib/runtime/src/realm_allocator.h +++ b/lib/runtime/src/realm_allocator.h @@ -15,9 +15,7 @@ struct RealmAllocator : public IAllocator { ~RealmAllocator() override; void *allocate(size_t) override; - void deallocate(void *) override -private: - Legion::Memory memory; + void deallocate(void *) override private : Legion::Memory memory; stack_vector instances; }; diff --git a/lib/runtime/src/sim_environment.cc b/lib/runtime/src/sim_environment.cc index a3a14a0b64..8233735396 100644 --- a/lib/runtime/src/sim_environment.cc +++ b/lib/runtime/src/sim_environment.cc @@ -2,25 +2,25 @@ namespaec FlexFlow { -void SimTaskBinding::bind(slot_id id, ParallelTensorShape const & shape) { + void SimTaskBinding::bind(slot_id id, ParallelTensorShape const &shape) { tensor_shape_bindings.insert(id, shape); -} -void SimTaskBinding::bind(slot_id id, TensorShape const & shape) { + } + void SimTaskBinding::bind(slot_id id, TensorShape const &shape) { tensor_shape_bindings.insert(id, shape); -} + } -void SimTaskBinding::bind(slot_id id, InputVariadicParallelTensorDesc const & desc) { + void SimTaskBinding::bind(slot_id id, + InputVariadicParallelTensorDesc const &desc) { this->tensor_shape_bindings.insert(id, desc); -} + } -void SimTaskBinding::bind_arg(slot_id id, SimArg const & arg) { + void SimTaskBinding::bind_arg(slot_id id, SimArg const &arg) { arg_bindings.insert(id, arg); -} - -TaskArgumentAccessor SimEnvironment::get_fwd_accessor(task_id_t tid, SimTaskBinding const & sim_task_binding) { - NOT_IMPLEMENTED();//TODO -} - + } + TaskArgumentAccessor SimEnvironment::get_fwd_accessor( + task_id_t tid, SimTaskBinding const &sim_task_binding) { + NOT_IMPLEMENTED(); // TODO + } } // namespace FlexFlow \ No newline at end of file diff --git a/lib/runtime/src/sim_environment.h b/lib/runtime/src/sim_environment.h index 7af7a84125..57b00eda34 100644 --- a/lib/runtime/src/sim_environment.h +++ b/lib/runtime/src/sim_environment.h @@ -26,7 +26,9 @@ struct InputVariadicParallelTensorDesc { }; using SimArg = variant; -using SimTensorSpec = variant; +using SimTensorSpec = variant; struct SimTaskBinding { void bind(slot_id, ParallelTensorShape const &); @@ -38,8 +40,9 @@ struct SimTaskBinding { void bind_untrainable(slot_id, std::vector const &); void bind(slot_id, std::vector const &, IsTrainable); void bind(slot_id, InputVariadicParallelTensorDesc const &); - - template void bind_arg(slot_id id, T const & name); + + template + void bind_arg(slot_id id, T const &name); std::unordered_map arg_bindings; std::unordered_map tensor_shape_bindings; diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc index b777afb637..55d66b035a 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.cc +++ b/lib/runtime/src/task_spec/task_argument_accessor.cc @@ -2,69 +2,77 @@ using namespace FlexFlow { -template T const & LocalTaskArgumentAccessor::get_argument(slot_id slot) const{ - if(slot == PROFILING) { - return get(this->arg_bindings.at(slot)); - } elif (slot == ATTRS) { - return get(this->arg_bindings.at(slot)); - } else { - throw std::runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_argument"); + template + T const &LocalTaskArgumentAccessor::get_argument(slot_id slot) const { + if (slot == PROFILING) { + return get(this->arg_bindings.at(slot)); } -} - -void * LocalTaskArgumentAccessor::allocate(size_t size) { - void * ptr = local_allocator.allocate(memory_size);//Note: how(when) to free this memory? - void * cpu_ptr = malloc(memory_size); - memset(cpu_ptr, 0, memory_size); - checkCUDA(cudaMemcpy(ptr, cpu_ptr, memory_size, cudaMemcpyHostToDevice)); //fill ptr - free(cpu_ptr); - return ptr; -} + elif (slot == ATTRS) { + return get(this->arg_bindings.at(slot)); + } + else { + throw std::runtime_error( + "Unknown Slot ID in LocalTaskArgumentAccessor::get_argument"); + } + } + void *LocalTaskArgumentAccessor::allocate(size_t size) { + void *ptr = local_allocator.allocate( + memory_size); // Note: how(when) to free this memory? + void *cpu_ptr = malloc(memory_size); + memset(cpu_ptr, 0, memory_size); + checkCUDA(cudaMemcpy( + ptr, cpu_ptr, memory_size, cudaMemcpyHostToDevice)); // fill ptr + free(cpu_ptr); + return ptr; + } -template privilege_mode_to_accessor LocalTaskArgumentAccessor::get_tensor(slot_id slot) const { - SimTensorSpec const & spec = this->tensor_shape_bindings.at(slot); - if(slot == GATE_PREDS || slot == GATE_ASSIGN ) { - InputParallelTensorDesc gate_preds = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); - DataType data_type = gate_preds.shape.data_type; - ArrayShape array_shape = {gate_preds.shape.dims.get_dims()};//gate_preds.shape.dims.get_dims() return std::vector - size_t shape_size =gate_preds.shape.dims.get_volume()* size_of(shape); - void * ptr = allocate(shape_size); - return gate_preds_accessor{shape, array_shape, ptr}; - } else if(slot == EXP_PREDS) { - InputVariadicParallelTensorDesc exp_preds = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); - //TODO, I should convert exp_preds to privilege_mode_to_accessor - NOT_IMPLEMENTED(); - } else if(slot == OUTPUT) { - ParallelTensorShape output_shape = get(this->sim_task_binding->tensor_shape_bindings.at(slot)); - Datatype data_type = output_shape.data_type; - ArrayShape array_shape = {output_shape.dims.get_dims()};//output_shape.dims.get_dims() return std::vector - size_t shape_size = output_shape.dims.get_volume() * size_of(data_type); - void * ptr = allocate(shape_size); - return {shape, array_shape, ptr}; + template + privilege_mode_to_accessor LocalTaskArgumentAccessor::get_tensor( + slot_id slot) const { + SimTensorSpec const &spec = this->tensor_shape_bindings.at(slot); + if (slot == GATE_PREDS || slot == GATE_ASSIGN) { + InputParallelTensorDesc gate_preds = get( + this->sim_task_binding->tensor_shape_bindings.at(slot)); + DataType data_type = gate_preds.shape.data_type; + ArrayShape array_shape = { + gate_preds.shape.dims.get_dims()}; // gate_preds.shape.dims.get_dims() + // return std::vector + size_t shape_size = gate_preds.shape.dims.get_volume() * size_of(shape); + void *ptr = allocate(shape_size); + return gate_preds_accessor{shape, array_shape, ptr}; + } else if (slot == OUTPUT) { + ParallelTensorShape output_shape = get( + this->sim_task_binding->tensor_shape_bindings.at(slot)); + Datatype data_type = output_shape.data_type; + ArrayShape array_shape = { + output_shape.dims.get_dims()}; // output_shape.dims.get_dims() return + // std::vector + size_t shape_size = output_shape.dims.get_volume() * size_of(data_type); + void *ptr = allocate(shape_size); + return {shape, array_shape, ptr}; } else { - throw mk_runtime_error("Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor"); + throw mk_runtime_error( + "Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor"); } -// NOT_IMPLEMENTED();//TODO, I should convert spec to privilege_mode_to_accessor -//lib/runtime/src/accessor.h -// template <> -// struct privilege_mode_to_accessor_t { -// using type = GenericTensorAccessorW; -// }; + } -// template <> -// struct privilege_mode_to_accessor_t { -// using type = GenericTensorAccessorR; -// }; - -// template <> -// struct privilege_mode_to_accessor_t { -// using type = GenericTensorAccessorW; -// }; - -// template -// using privilege_mode_to_accessor = -// typename privilege_mode_to_accessor_t::type; -} + template + std::vector> get_variadic_tensor( + slot_id slot) const override { + std::vector> result; + InputVariadicParallelTensorDesc const &spec = + get( + this->sim_task_binding->tensor_shape_bindings.at(slot)); + for (auto const &shape : spec.shapes) { + ArrayShape array_shape = { + shape.dims + .get_dims()}; // shape.dims.get_dims() return std::vector + size_t shape_size = shape.dims.get_volume() * size_of(shape.data_type); + void *ptr = allocate(shape_size); + result.push_back({shape, array_shape, ptr}); + } + return result; + } -}//namespace FlexFlow \ No newline at end of file +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h index 0229ccd50b..d640fc04a9 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ b/lib/runtime/src/task_spec/task_argument_accessor.h @@ -2,12 +2,12 @@ #define _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H #include "accessor.h" +#include "kernels/allocation.h" #include "runtime/config.h" #include "task_invocation.h" #include "utils/exception.h" #include "utils/stack_map.h" #include "utils/strong_typedef.h" -#include "kernels/allocation.h" #include namespace FlexFlow { @@ -71,85 +71,33 @@ region_idx_t get_region_idx(TaskArgumentsFormat const &, parallel_tensor_guid_t const &); DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &); - struct ITaskArgumentAccessor { -virtual template T const &get_argument(slot_id slot) const = 0; + virtual template + T const &get_argument(slot_id slot) const = 0; - -virtual template + virtual template privilege_mode_to_accessor get_tensor(slot_id slot) const = 0; -}; -struct LegionTaskArgumentAccessor: public ITaskArgumentAccessor{ + virtual template + std::vector> + get_variadic_tensor(slot_id slot) const = 0; +}; +struct LegionTaskArgumentAccessor : public ITaskArgumentAccessor { public: -virtual template T const &get_argument(slot_id slot) const override; -virtual template privilege_mode_to_accessor get_tensor(slot_id slot) const override; - -// template -// T const &get_argument(slot_id slot) const { -// TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); -// std::type_index actual_type = arg_fmt.type; -// std::type_index requested_type = {typeid(T)}; - -// if (actual_type != requested_type) { -// throw mk_runtime_error( -// "Type mismatch in argument access (\"{}\" != \"{}\")", -// actual_type.name(), -// requested_type.name()); -// } - -// void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; -// Legion::Deserializer dez(start_ptr, arg_fmt.size()); - -// return ff_task_deserialize(dez); -// } - -LegionTaskArgumentAccessor(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime):task(task), regions(regions), ctx(ctx), runtime(runtime){} - -// template privilege_mode_to_accessor -// get_generic_accessor(region_idx_t const &idx) const { -// auto tensor_privs = get_permissions(this->args_fmt, idx); -// if (tensor_privs != PRIV) { -// throw mk_runtime_error( -// "Privilege mismatch while accessing tensor: {} != {}", -// tensor_privs, -// PRIV); -// } - -// return helperGetGenericTensorAccessor( -// get_datatype(this->args_fmt, idx), -// regions[idx.value()], -// task->regions[idx.value()], -// FID_DATA, -// ctx, -// runtime); -// } - -// template -// privilege_mode_to_accessor get_tensor(slot_id slot) const { -// auto argument_format = -// get(this->args_fmt.region_idxs.at(slot)); - -// return this->get_generic_accessor(argument_format); -// } - -// template -// std::vector> -// get_variadic_tensor(slot_id slot) const { -// std::vector> result; - -// auto argument_format = -// get(this->args_fmt.region_idxs.at(slot)); -// for (NonvariadicFormat const &argument : argument_format) { -// result.push_back(this->get_generic_accessor(argument)); -// } - -// return result; -// } + template + T const &get_argument(slot_id slot) const override; + template + privilege_mode_to_accessor get_tensor(slot_id slot) const override; + template + std::vector> + get_variadic_tensor(slot_id slot) const override; + + LegionTaskArgumentAccessor(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) + : task(task), regions(regions), ctx(ctx), runtime(runtime) {} private: Legion::Task const *task; @@ -159,129 +107,47 @@ LegionTaskArgumentAccessor(Legion::Task const *task, TaskArgumentsFormat const &args_fmt; }; -struct LocalTaskArgumentAccessor: public ITaskArgumentAccessor{ - +struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { public: -virtual template T const &get_argument(slot_id slot) const override; -virtual template privilege_mode_to_accessor get_tensor(slot_id slot) const override; - -LocalTaskArgumentAccessor(std::shared_ptr & sim_task_binding):sim_task_binding(sim_task_binding){ - local_allocator = Allocator::create(); -} - -void * allocate(size_t size); + template + T const &get_argument(slot_id slot) const override; + template + privilege_mode_to_accessor get_tensor(slot_id slot) const override; + template + std::vector> + get_variadic_tensor(slot_id slot) const override; + + LocalTaskArgumentAccessor( + std::shared_ptr &sim_task_binding) + : sim_task_binding(sim_task_binding) { + local_allocator = Allocator::create(); + } -void deallocate(void * ptr); + void *allocate(size_t size); + void deallocate(void *ptr); private: std::shared_ptr sim_task_binding; - // CudaAllocator local_allocator; - Allocator local_allocator; + Allocator local_allocator; }; struct TaskArgumentAccessor { - // TaskArgumentAccessor(Legion::Task const *task, - // std::vector const ®ions, - // Legion::Context ctx, - // Legion::Runtime *runtime); - template + template T const &get_argument(slot_id slot) const { return this->ptr->get_argument(slot); } - - template + template privilege_mode_to_accessor get_tensor(slot_id slot) const { return this->ptr->get_tensor(slot); } + template + std::vector> + get_variadic_tensor(slot_id slot) const { + return this->ptr->get_variadic_tensor(slot); + } - - // template - // T const &get_argument(slot_id slot) const { - // TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); - // std::type_index actual_type = arg_fmt.type; - // std::type_index requested_type = {typeid(T)}; - - // if (actual_type != requested_type) { - // throw mk_runtime_error( - // "Type mismatch in argument access (\"{}\" != \"{}\")", - // actual_type.name(), - // requested_type.name()); - // } - - // void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; - // Legion::Deserializer dez(start_ptr, arg_fmt.size()); - - // return ff_task_deserialize(dez); - // } - - // template - // optional get_optional_argument(slot_id) const { - // NOT_IMPLEMENTED(); - // } - - // template - // std::vector get_variadic_argument(slot_id) const { - // NOT_IMPLEMENTED(); - // } - - // template - // privilege_mode_to_accessor - // get_generic_accessor(region_idx_t const &idx) const { - // auto tensor_privs = get_permissions(this->args_fmt, idx); - // if (tensor_privs != PRIV) { - // throw mk_runtime_error( - // "Privilege mismatch while accessing tensor: {} != {}", - // tensor_privs, - // PRIV); - // } - - // return helperGetGenericTensorAccessor( - // get_datatype(this->args_fmt, idx), - // regions[idx.value()], - // task->regions[idx.value()], - // FID_DATA, - // ctx, - // runtime); - // } - - // template - // privilege_mode_to_accessor get_tensor(slot_id slot) const { - // auto argument_format = - // get(this->args_fmt.region_idxs.at(slot)); - - // return this->get_generic_accessor(argument_format); - // } - - // template - // privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { - // NOT_IMPLEMENTED(); - // } - - // template - // std::vector> - // get_variadic_tensor(slot_id slot) const { - // std::vector> result; - - // auto argument_format = - // get(this->args_fmt.region_idxs.at(slot)); - // for (NonvariadicFormat const &argument : argument_format) { - // result.push_back(this->get_generic_accessor(argument)); - // } - - // return result; - // } - - // template - // std::vector> - // get_variadic_tensor_grad(slot_id slot) const { - // NOT_IMPLEMENTED(); - // } - - // size_t get_device_idx() const { - // NOT_IMPLEMENTED(); - // } template static typename std::enable_if::value, @@ -290,16 +156,11 @@ struct TaskArgumentAccessor { return TaskArgumentAccessor( std::make_shared(std::forward(args)...)); } + private: - // Legion::Task const *task; - // std::vector const ®ions; - // Legion::Context ctx; - // Legion::Runtime *runtime; - // TaskArgumentsFormat const &args_fmt; - //cow_ptr_t const & ITaskArgumentAccesor; - //cow_ptr_t const & ptr; - TaskArgumentAccessor(std::shared_ptr & ptr): ptr(ptr) {} - std::shared_ptr ptr; + TaskArgumentAccessor(std::shared_ptr &ptr) + : ptr(ptr) {} + std::shared_ptr ptr; }; } // namespace FlexFlow From 07ef870e670bd6cd3db0528926867ccfa85c6553 Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 9 Aug 2023 01:59:28 +0000 Subject: [PATCH 08/13] delete the kernels/src/cuda/accessor.cc --- lib/kernels/src/cuda/accessor.cc | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 lib/kernels/src/cuda/accessor.cc diff --git a/lib/kernels/src/cuda/accessor.cc b/lib/kernels/src/cuda/accessor.cc deleted file mode 100644 index 1559c10350..0000000000 --- a/lib/kernels/src/cuda/accessor.cc +++ /dev/null @@ -1,23 +0,0 @@ -#include "kernels/accessor.h" - -namespace FlexFlow { - -int32_t *get_int32_ptr(GenericTensorAccessorW const &w) { - return static_cast( - w.ptr.value()); // Note(lambda):we use static_cast, may have some problem -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &w) { - return static_cast(w.ptr.value()); -} -float *get_float_ptr(GenericTensorAccessorW const &w) { - return static_cast(w.ptr.value()); -} -double *get_double_ptr(GenericTensorAccessorW const &w) { - return static_cast(w.ptr.value()); -} -half *get_half_ptr(GenericTensorAccessorW const &w) { - return static_cast(w.ptr.value()); -} - -} // namespace FlexFlow \ No newline at end of file From 445863d0f9875475c94fd0fd6aa791b4f7c73d6a Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 9 Aug 2023 02:07:06 +0000 Subject: [PATCH 09/13] fix the cuda_allocator error --- lib/runtime/src/cuda_allocator.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/runtime/src/cuda_allocator.cc b/lib/runtime/src/cuda_allocator.cc index 5a4529c8ed..f60bf513ee 100644 --- a/lib/runtime/src/cuda_allocator.cc +++ b/lib/runtime/src/cuda_allocator.cc @@ -1,15 +1,16 @@ #include "cuda_allocator.h" +#include "kernels/device.h" namespace FlexFlow { void *CudaAllocator::allocate(size_t size) { void *ptr; - check_CUDA(cudaMalloc(&ptr, size)); + checkCUDA(cudaMalloc(&ptr, size)); return ptr; } void CudaAllocator::deallocate(void *ptr) { - check_CUDA(cudaFree(ptr)); + checkCUDA(cudaFree(ptr)); } } // namespace FlexFlow From eaf889d29b951ccfedb47b28d33a588fe3b199dd Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Wed, 9 Aug 2023 07:10:56 +0000 Subject: [PATCH 10/13] add API interface for ITaskArgumentAccessor and its subclass --- .../src/task_spec/task_argument_accessor.h | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h index d640fc04a9..448a92d94b 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ b/lib/runtime/src/task_spec/task_argument_accessor.h @@ -81,18 +81,54 @@ struct ITaskArgumentAccessor { virtual template std::vector> get_variadic_tensor(slot_id slot) const = 0; + + virtual template + optional get_optional_argument(slot_id) const = 0; + + virtual template + std::vector get_variadic_argument(slot_id) const = 0; + + virtual template + privilege_mode_to_accessor + get_generic_accessor(region_idx_t const &idx) const = 0; + + virtual template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const = 0; + + virtual template + std::vector> + get_variadic_tensor_grad(slot_id slot) const = 0; + + virtual size_t get_device_idx() const = 0; }; struct LegionTaskArgumentAccessor : public ITaskArgumentAccessor { public: template T const &get_argument(slot_id slot) const override; + template privilege_mode_to_accessor get_tensor(slot_id slot) const override; + template std::vector> get_variadic_tensor(slot_id slot) const override; + template + optional get_optional_argument(slot_id) const override; + + template + std::vector get_variadic_argument(slot_id) const override; + + template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const override; + + template + std::vector> + get_variadic_tensor_grad(slot_id slot) const override; + + size_t get_device_idx() const override; + LegionTaskArgumentAccessor(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -111,12 +147,29 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { public: template T const &get_argument(slot_id slot) const override; + template privilege_mode_to_accessor get_tensor(slot_id slot) const override; + template std::vector> get_variadic_tensor(slot_id slot) const override; + template + optional get_optional_argument(slot_id) const override; + + template + std::vector get_variadic_argument(slot_id) const override; + + template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const override; + + template + std::vector> + get_variadic_tensor_grad(slot_id slot) const override; + + size_t get_device_idx() const override; + LocalTaskArgumentAccessor( std::shared_ptr &sim_task_binding) : sim_task_binding(sim_task_binding) { From e8a80c636d59a928f955f2c99f7590c3393ddc8d Mon Sep 17 00:00:00 2001 From: lambda7xx Date: Thu, 10 Aug 2023 09:11:37 +0000 Subject: [PATCH 11/13] add memory usage track for LocalTaskArgumentAccessor --- .../src/task_spec/task_argument_accessor.cc | 20 ++++++++++--------- .../src/task_spec/task_argument_accessor.h | 7 ++++++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc index 55d66b035a..c74685d662 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.cc +++ b/lib/runtime/src/task_spec/task_argument_accessor.cc @@ -17,12 +17,13 @@ using namespace FlexFlow { } void *LocalTaskArgumentAccessor::allocate(size_t size) { - void *ptr = local_allocator.allocate( - memory_size); // Note: how(when) to free this memory? - void *cpu_ptr = malloc(memory_size); - memset(cpu_ptr, 0, memory_size); - checkCUDA(cudaMemcpy( - ptr, cpu_ptr, memory_size, cudaMemcpyHostToDevice)); // fill ptr + void *ptr = + local_allocator.allocate(size); // Note: how(when) to free this memory? + void *cpu_ptr = malloc(size); + memory_usage += size; // update the usage of memory + memset(cpu_ptr, 0, size); + checkCUDA( + cudaMemcpy(ptr, cpu_ptr, size, cudaMemcpyHostToDevice)); // fill ptr free(cpu_ptr); return ptr; } @@ -40,7 +41,7 @@ using namespace FlexFlow { // return std::vector size_t shape_size = gate_preds.shape.dims.get_volume() * size_of(shape); void *ptr = allocate(shape_size); - return gate_preds_accessor{shape, array_shape, ptr}; + return gate_preds_accessor{data_type, array_shape, ptr}; } else if (slot == OUTPUT) { ParallelTensorShape output_shape = get( this->sim_task_binding->tensor_shape_bindings.at(slot)); @@ -50,7 +51,7 @@ using namespace FlexFlow { // std::vector size_t shape_size = output_shape.dims.get_volume() * size_of(data_type); void *ptr = allocate(shape_size); - return {shape, array_shape, ptr}; + return {data_type, array_shape, ptr}; } else { throw mk_runtime_error( "Unknown Slot ID in LocalTaskArgumentAccessor::get_tensor"); @@ -70,7 +71,8 @@ using namespace FlexFlow { .get_dims()}; // shape.dims.get_dims() return std::vector size_t shape_size = shape.dims.get_volume() * size_of(shape.data_type); void *ptr = allocate(shape_size); - result.push_back({shape, array_shape, ptr}); + DataType data_type = shape.data_type; + result.push_back({data_type, array_shape, ptr}); } return result; } diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h index 448a92d94b..f064aa4b9a 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ b/lib/runtime/src/task_spec/task_argument_accessor.h @@ -172,16 +172,21 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { LocalTaskArgumentAccessor( std::shared_ptr &sim_task_binding) - : sim_task_binding(sim_task_binding) { + : sim_task_binding(sim_task_binding), memory_usage(0) { local_allocator = Allocator::create(); } + size_t get_memory_usage() const { + return memory_usage; + } + void *allocate(size_t size); void deallocate(void *ptr); private: std::shared_ptr sim_task_binding; Allocator local_allocator; + size_t memory_usage; }; struct TaskArgumentAccessor { From f39178221866db84db770aa6a5cd69a66880888b Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 21 Aug 2023 09:07:11 -0700 Subject: [PATCH 12/13] Pull in formatting code from repo-refactor --- .clang-format-for-format-sh | 182 +++++++++++++++++++++++ .github/workflows/clang-format-check.yml | 20 ++- scripts/format.sh | 11 +- 3 files changed, 202 insertions(+), 11 deletions(-) create mode 100644 .clang-format-for-format-sh diff --git a/.clang-format-for-format-sh b/.clang-format-for-format-sh new file mode 100644 index 0000000000..17e9f8935d --- /dev/null +++ b/.clang-format-for-format-sh @@ -0,0 +1,182 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +AttributeMacros: + - __capability +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseLabels: true +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 2 +IndentWrappedFunctionNames: true +InsertBraces: true +InsertTrailingCommas: None +InsertNewlineAtEOF: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +PPIndentWidth: -1 +QualifierAlignment: Right +ReferenceAlignment: Pointer +ReflowComments: true +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +WhitespaceSensitiveMacros: + - STRINGIZE + - PP_STRINGIZE + - BOOST_PP_STRINGIZE + - NS_SWIFT_NAME + - CF_SWIFT_NAME +... + diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 46c9bf3be2..fb93fd6b5b 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -5,21 +5,25 @@ jobs: name: Formatting Check runs-on: ubuntu-latest strategy: + fail-fast: false matrix: path: - - check: "src" - exclude: '\.proto$' - - check: "include" - - check: "nmt" - - check: "python" - - check: "scripts" + - check: "lib/compiler" + - check: "lib/ffi" + - check: "lib/kernels" + - check: "lib/op-attrs" + - check: "lib/pcg" + - check: "lib/runtime" + - check: "lib/substitutions" + - check: "lib/utils" - check: "tests" - check: "examples" + - check: "bindings" steps: - uses: actions/checkout@v2 - name: Run clang-format style check for C/C++/Protobuf programs. - uses: jidicula/clang-format-action@v4.8.0 + uses: lockshaw/clang-format-action@v4.11.0-flexflow-3 with: - clang-format-version: "15" + clang-format-version: "16" check-path: ${{ matrix.path['check'] }} exclude-regex: ${{ matrix.path['exclude'] }} diff --git a/scripts/format.sh b/scripts/format.sh index 2ed97b8f0a..9610dc2d26 100755 --- a/scripts/format.sh +++ b/scripts/format.sh @@ -6,8 +6,8 @@ GIT_ROOT="$(git rev-parse --show-toplevel)" cd "$GIT_ROOT" TOOLS_PATH="$GIT_ROOT/.tools" -RELEASE="master-1d7ec53d" -CLANG_FORMAT_VERSION="15" +RELEASE="master-f4f85437" +CLANG_FORMAT_VERSION="16" CLANG_FORMAT_PATH="$TOOLS_PATH/clang-format-$CLANG_FORMAT_VERSION-$RELEASE" mkdir -p "$TOOLS_PATH" @@ -68,5 +68,10 @@ if [[ ! -e $CLANG_FORMAT_PATH ]]; then chmod u+x "$CLANG_FORMAT_PATH" fi +CLANG_FORMAT_CONFIG="$GIT_ROOT/.clang-format-for-format-sh" mapfile -t FILES < <(git ls-files ':!:triton/**' '*.h' '*.cc' '*.cpp' '*.cu' '*.c') -"$CLANG_FORMAT_PATH" -i "${FILES[@]}" +if [[ -f $CLANG_FORMAT_CONFIG ]]; then + "$CLANG_FORMAT_PATH" --style=file:"$CLANG_FORMAT_CONFIG" -i "${FILES[@]}" +else + echo "error" +fi From 6469d44e29fad328b873fec9864e193dc11fb354 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Mon, 21 Aug 2023 09:07:37 -0700 Subject: [PATCH 13/13] Format --- examples/cpp/DLRM/strategies/dlrm_strategy.cc | 2 +- lib/compiler/include/compiler/machine_mapping.h | 2 +- lib/compiler/test/test_dp.cc | 2 +- lib/compiler/test/test_open_graph.cc | 2 +- lib/kernels/include/kernels/reshape_kernels.h | 2 +- lib/kernels/src/allocation.cc | 2 +- lib/kernels/src/hip/aggregate_kernels.cpp | 2 +- lib/kernels/src/hip/optimizer_kernel.cpp | 2 +- lib/op-attrs/src/datatype.cc | 2 +- lib/runtime/src/cuda_allocator.h | 2 +- lib/runtime/src/sim_environment.cc | 2 +- lib/runtime/src/task_spec/task_argument_accessor.cc | 2 +- lib/runtime/test/src/test_op_task_spec.cc | 2 +- lib/runtime/test/src/test_serialization.cc | 2 +- lib/utils/include/utils/containers.h | 2 +- lib/utils/include/utils/internal_only_tag.h | 2 +- lib/utils/include/utils/visitable.h | 2 +- lib/utils/test/src/test_algorithms.cc | 2 +- python/flexflow_c.cc | 4 +++- 19 files changed, 21 insertions(+), 19 deletions(-) diff --git a/examples/cpp/DLRM/strategies/dlrm_strategy.cc b/examples/cpp/DLRM/strategies/dlrm_strategy.cc index 2fcc4173c9..a7fab8c3a8 100644 --- a/examples/cpp/DLRM/strategies/dlrm_strategy.cc +++ b/examples/cpp/DLRM/strategies/dlrm_strategy.cc @@ -356,4 +356,4 @@ int main(int argc, char **argv) { std::to_string(gpu) + "gpus.pb"; std::fstream outputFile(output.c_str(), std::ios::out | std::ios::trunc); strategy.SerializeToOstream(&outputFile); */ -} \ No newline at end of file +} diff --git a/lib/compiler/include/compiler/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping.h index c105221682..400e2770f8 100644 --- a/lib/compiler/include/compiler/machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping.h @@ -39,4 +39,4 @@ MachineMapping optimal_cost( MAKE_VISIT_HASHABLE(::FlexFlow::MachineMapping); -#endif \ No newline at end of file +#endif diff --git a/lib/compiler/test/test_dp.cc b/lib/compiler/test/test_dp.cc index 18b098a202..01e4189839 100644 --- a/lib/compiler/test/test_dp.cc +++ b/lib/compiler/test/test_dp.cc @@ -51,4 +51,4 @@ TEST_CASE("optimal_cost") { optimal_cost(pcg, allowed_machine_views, TestCostEstimator{}, resource); // TODO: check result -} \ No newline at end of file +} diff --git a/lib/compiler/test/test_open_graph.cc b/lib/compiler/test/test_open_graph.cc index 6288b481fe..66af736a50 100644 --- a/lib/compiler/test/test_open_graph.cc +++ b/lib/compiler/test/test_open_graph.cc @@ -102,4 +102,4 @@ TEST_CASE("get_cut") { GraphSplit gs1{{ns[0], ns[1], ns[2], ns[3]}, {ns[4]}}; CHECK(get_cut(g, gs1) == std::unordered_set{e3, e4}); -} \ No newline at end of file +} diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index fa752b6c2b..972f8ee9b2 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -30,4 +30,4 @@ void backward_kernel(ffStream_t stream, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index eb90b0cb69..fa32a9a705 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -10,4 +10,4 @@ void Allocator::deallocate(void *ptr) { i_allocator->deallocate(ptr); } -} // namespace FlexFlow \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/kernels/src/hip/aggregate_kernels.cpp b/lib/kernels/src/hip/aggregate_kernels.cpp index ff50a8c7ad..40faaaeeff 100644 --- a/lib/kernels/src/hip/aggregate_kernels.cpp +++ b/lib/kernels/src/hip/aggregate_kernels.cpp @@ -296,4 +296,4 @@ AggregatePerDeviceState::~AggregatePerDeviceState(void) { } // namespace Aggregate } // namespace Kernels -} // namespace FlexFlow \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/kernels/src/hip/optimizer_kernel.cpp b/lib/kernels/src/hip/optimizer_kernel.cpp index 7f57d6a2fb..c22ecd7f5a 100644 --- a/lib/kernels/src/hip/optimizer_kernel.cpp +++ b/lib/kernels/src/hip/optimizer_kernel.cpp @@ -245,4 +245,4 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, } #endif -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/lib/op-attrs/src/datatype.cc b/lib/op-attrs/src/datatype.cc index fc47ef47eb..6fb2d41051 100644 --- a/lib/op-attrs/src/datatype.cc +++ b/lib/op-attrs/src/datatype.cc @@ -21,4 +21,4 @@ size_t size_of(DataType data_type) { } } -} // namespace FlexFlow \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/runtime/src/cuda_allocator.h b/lib/runtime/src/cuda_allocator.h index cb3c8e1c19..94f47ba035 100644 --- a/lib/runtime/src/cuda_allocator.h +++ b/lib/runtime/src/cuda_allocator.h @@ -15,4 +15,4 @@ struct CudaAllocator : public IAllocator { } // namespace FlexFlow -#endif \ No newline at end of file +#endif diff --git a/lib/runtime/src/sim_environment.cc b/lib/runtime/src/sim_environment.cc index 8233735396..8c599ca2e5 100644 --- a/lib/runtime/src/sim_environment.cc +++ b/lib/runtime/src/sim_environment.cc @@ -23,4 +23,4 @@ namespaec FlexFlow { NOT_IMPLEMENTED(); // TODO } -} // namespace FlexFlow \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/runtime/src/task_spec/task_argument_accessor.cc b/lib/runtime/src/task_spec/task_argument_accessor.cc index c74685d662..26ad9e3016 100644 --- a/lib/runtime/src/task_spec/task_argument_accessor.cc +++ b/lib/runtime/src/task_spec/task_argument_accessor.cc @@ -77,4 +77,4 @@ using namespace FlexFlow { return result; } -} // namespace FlexFlow \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/runtime/test/src/test_op_task_spec.cc b/lib/runtime/test/src/test_op_task_spec.cc index 821ef0dba6..bb0bee567c 100644 --- a/lib/runtime/test/src/test_op_task_spec.cc +++ b/lib/runtime/test/src/test_op_task_spec.cc @@ -44,4 +44,4 @@ TEST_CASE("OpTaskBinding") { correct_bwd.bind_grad(2, input_tensor(2).grad()); CHECK(correct_bwd == bwd); -} \ No newline at end of file +} diff --git a/lib/runtime/test/src/test_serialization.cc b/lib/runtime/test/src/test_serialization.cc index caf5cd7c93..ef18764efb 100644 --- a/lib/runtime/test/src/test_serialization.cc +++ b/lib/runtime/test/src/test_serialization.cc @@ -44,4 +44,4 @@ TEST_CASE("Serialization") { RC_ASSERT(post_op == pre_op); })) } -} \ No newline at end of file +} diff --git a/lib/utils/include/utils/containers.h b/lib/utils/include/utils/containers.h index df156c9060..236e4b6eac 100644 --- a/lib/utils/include/utils/containers.h +++ b/lib/utils/include/utils/containers.h @@ -714,4 +714,4 @@ reversed_container_t reversed_container(C const &c) { } // namespace FlexFlow -#endif \ No newline at end of file +#endif diff --git a/lib/utils/include/utils/internal_only_tag.h b/lib/utils/include/utils/internal_only_tag.h index 649ce4cf12..1e5f8571d0 100644 --- a/lib/utils/include/utils/internal_only_tag.h +++ b/lib/utils/include/utils/internal_only_tag.h @@ -7,4 +7,4 @@ struct should_only_be_used_internally_tag_t { }; } // namespace FlexFlow -#endif \ No newline at end of file +#endif diff --git a/lib/utils/include/utils/visitable.h b/lib/utils/include/utils/visitable.h index 4f0dc50cbe..6a671400cd 100644 --- a/lib/utils/include/utils/visitable.h +++ b/lib/utils/include/utils/visitable.h @@ -457,4 +457,4 @@ struct Arbitrary< _GET_VISITABLE_CASE_FROM_NUM_ARGS(__VA_ARGS__), \ __VA_ARGS__) -#endif \ No newline at end of file +#endif diff --git a/lib/utils/test/src/test_algorithms.cc b/lib/utils/test/src/test_algorithms.cc index 35534f5b3a..7748bd3ff0 100644 --- a/lib/utils/test/src/test_algorithms.cc +++ b/lib/utils/test/src/test_algorithms.cc @@ -220,4 +220,4 @@ TEST_CASE("get_weakly_connected_components") { }; CHECK(get_weakly_connected_components(g) == expected_components); -} \ No newline at end of file +} diff --git a/python/flexflow_c.cc b/python/flexflow_c.cc index 4c16dd82e9..1d3d77afe5 100644 --- a/python/flexflow_c.cc +++ b/python/flexflow_c.cc @@ -33,7 +33,9 @@ class FFCObjectWrapper { t_.impl = const_cast(static_cast(t)); \ return t_; \ } \ - static T unwrap(T_ t_) { return static_cast(t_.impl); } \ + static T unwrap(T_ t_) { \ + return static_cast(t_.impl); \ + } \ static const T unwrap_const(const T_ t_) { \ return static_cast(t_.impl); \ }