diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 36796bc504..15f14f8757 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -42,7 +42,7 @@ struct ArrayShape { ArrayShape reversed_dim_order() const; ArrayShape sub_shape(std::optional start, - std::optional end); + std::optional end) const; public: LegionTensorDims dims; diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 17e0048c65..dedfbb01ef 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -9,9 +9,6 @@ namespace FlexFlow { -using ElementUnaryUnifiedAttrs = - std::variant; - struct ElementUnaryPerDeviceState { ffTensorDescriptor_t inputTensor, outputTensor; req actiDesc; diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index c74f9c0bb6..13bf4b898a 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -2,36 +2,34 @@ #define _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H #include "accessor.h" -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { struct GatherPerDeviceState { - int legion_dim; - req index_data_type; + PerDeviceFFHandle handle; + legion_dim_t legion_dim; }; + FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, - legion_dim, - index_data_type); + handle, + legion_dim); namespace Kernels { namespace Gather { + void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size); + GenericTensorAccessorW const &output); + void backward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &output_grad, 
GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size); + GenericTensorAccessorW const &input_grad); + } // namespace Gather } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 37d0112eab..286acf7376 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -25,10 +25,10 @@ template __global__ void gather_forward(float const *input, IndexType const *index, float *output, - size_t output_size, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] @@ -39,10 +39,10 @@ __global__ void gather_forward(float const *input, // [outer_index, index[0], left_over] // Therefore, input_index = outer_index * (stride * input_dim_size) // + index[0] * stride + left_over; - size_t outer_index = o / (stride * output_dim_size); + coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size - size_t left_over = o % stride; - size_t input_idx = + coord_t left_over = o % stride; + coord_t input_idx = outer_index * (stride * input_dim_size) + index[o] * stride + left_over; output[o] = input[input_idx]; } @@ -52,10 +52,10 @@ template __global__ void gather_backward(float const *output_grad, IndexType const *index, float *input_grad, - size_t output_size, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * 
stride, stride, 1] @@ -66,10 +66,10 @@ __global__ void gather_backward(float const *output_grad, // [outer_index, index[0], left_over] // Therefore, input_index = outer_index * (stride * input_dim_size) // + index[0] * stride + left_over; - size_t outer_index = o / (stride * output_dim_size); + coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size - size_t left_over = o % stride; - size_t input_idx = + coord_t left_over = o % stride; + coord_t input_idx = outer_index * (stride * input_dim_size) + index[o] * stride + left_over; atomicAdd(&input_grad[input_idx], output_grad[o]); @@ -78,100 +78,97 @@ __global__ void gather_backward(float const *output_grad, template struct ForwardKernel { - void operator()(cudaStream_t stream, - GatherPerDeviceState const &m, + void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - /*size_t stride = 1; - for (int i = 0; i < m->legion_dim; i++) { - stride *= (output.domain.hi()[i] - output.domain.lo()[i] + 1); - } - size_t dim_size = - output.domain.hi()[m->legion_dim] - output.domain.lo()[m->legion_dim] + - 1; -*/ - gather_forward> - <<>>(input.get(), - index.get(), - output.get(), - output.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { + gather_forward<<>>( + input.get_float_ptr(), + index.get(), + output.get_float_ptr(), + output_size, + stride, + input_dim_size, + output_dim_size); } }; -void forward_kernel(cudaStream_t stream, +template +struct BackwardKernel { + void operator()(ffStream_t stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &input_grad, + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t 
output_dim_size) { + gather_backward<<>>( + output_grad.get_float_ptr(), + index.get(), + input_grad.get_float_ptr(), + output_size, + stride, + input_dim_size, + output_dim_size); + } +}; + +void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - DataTypeDispatch1{}(m.index_data_type, + GenericTensorAccessorW const &output) { + checkCUDA(get_legion_stream(&stream)); + + coord_t stride = + output.shape + .sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1}) + .get_volume(); + coord_t output_dim_size = output.shape[m.legion_dim]; + coord_t input_dim_size = input.shape[m.legion_dim]; + + assert(index.data_type == DataType::INT32 || + index.data_type == DataType::INT64); + + DataTypeDispatch1{}(index.data_type, stream, - m, input, index, output, + output.shape.get_volume(), stride, input_dim_size, output_dim_size); } -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - GatherPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - /*size_t stride = 1; - for (int i = 0; i < m->legion_dim; i++) { - stride *= (output_grad.domain.hi()[i] - output_grad.domain.lo()[i] + 1); - } - size_t dim_size = output_grad.domain.hi()[m->legion_dim] - - output_grad.domain.lo()[m->legion_dim] + 1; - */ - gather_backward> - <<>>(output_grad.get(), - index.get(), - input_grad.get(), - output_grad.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); - } -}; - -void backward_kernel(cudaStream_t stream, +void backward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, - GenericTensorAccessorW const 
&input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - DataTypeDispatch1{}(m.index_data_type, + GenericTensorAccessorW const &input_grad) { + checkCUDA(get_legion_stream(&stream)); + + coord_t stride = + output_grad.shape + .sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1}) + .get_volume(); + coord_t output_dim_size = output_grad.shape[m.legion_dim]; + coord_t input_dim_size = input_grad.shape[m.legion_dim]; + + assert(index.data_type == DataType::INT32 || + index.data_type == DataType::INT64); + + DataTypeDispatch1{}(index.data_type, stream, - m, output_grad, index, input_grad, + output_grad.shape.get_volume(), stride, input_dim_size, output_dim_size); diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt index ee1d8fecdc..6b432fad75 100644 --- a/lib/local-execution/CMakeLists.txt +++ b/lib/local-execution/CMakeLists.txt @@ -12,4 +12,5 @@ ff_add_library( utils kernels pcg + spdlog ) \ No newline at end of file diff --git a/lib/runtime/src/task_spec/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h similarity index 50% rename from lib/runtime/src/task_spec/arg_ref.h rename to lib/local-execution/include/local-execution/arg_ref.h index 62f89f0b5c..50fe4e6f80 100644 --- a/lib/runtime/src/task_spec/arg_ref.h +++ b/lib/local-execution/include/local-execution/arg_ref.h @@ -1,9 +1,9 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H #include "kernels/ff_handle.h" -#include "runtime/profiling.h" -#include "runtime/task_spec/arg_type_runtime_tag.h" +#include "local-execution/profiling.h" +#include "local-execution/serialization.h" #include "utils/type_index.h" #include "utils/visitable.h" @@ -21,37 +21,43 @@ struct ArgRefSpec { template bool holds() const { - return this->type_tag.template matches(); + // return this->type_tag.template matches(); + + return 
matches(this->type_idx); } LABEL_TYPE const &get_ref_type() const { return this->ref_type; } - ArgTypeRuntimeTag get_type_tag() const { - return this->type_tag; + // TODO - how to extend this for legion runtime? + // ArgTypeRuntimeTag get_type_tag() const { + // return this->type_tag; + // } + std::type_index get_type_index() const { + return this->type_idx; } template static ArgRefSpec create(ArgRef const &r) { static_assert(is_serializable::value, "Type must be serializeable"); - return ArgRefSpec(ArgTypeRuntimeTag::create(), r.ref_type); + return ArgRefSpec(get_type_index_for_type(), r.ref_type); } template static ArgRefSpec create_device_specific(ArgRef const &r, size_t device_idx) { - return ArgRefSpec(ArgTypeRuntimeTag::create(), r.ref_type, device_idx); + return ArgRefSpec(get_type_index_for_type(), r.ref_type, device_idx); } private: - ArgRefSpec(ArgTypeRuntimeTag const &type_tag, LABEL_TYPE ref_type) - : type_tag(type_tag), ref_type(ref_type) {} + ArgRefSpec(std::type_index const &type_index, LABEL_TYPE ref_type) + : type_idx(type_index), ref_type(ref_type) {} - ArgTypeRuntimeTag type_tag; + std::type_index type_idx; LABEL_TYPE ref_type; - optional device_idx = nullopt; + std::optional device_idx = std::nullopt; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h new file mode 100644 index 0000000000..2db5e45e9e --- /dev/null +++ b/lib/local-execution/include/local-execution/concrete_arg.h @@ -0,0 +1,55 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H +#define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H + +#include "local-execution/serialization.h" +#include "utils/type_index.h" +#include + +namespace FlexFlow { + +struct ConcreteArgSpec { +public: + ConcreteArgSpec() = delete; + + template + T const &get() const { + assert(matches(this->type_idx)); + + return *(T const *)ptr.get(); + } + + // ArgTypeRuntimeTag get_type_tag() const { + // return 
this->type_tag; + // } + // size_t serialize(Legion::Serializer &) const; + + std::type_index get_type_index() const { + return this->type_idx; + } + + template + static ConcreteArgSpec create(T const &t) { + static_assert(is_serializable::value, "Type must be serializable"); + + std::type_index type_idx = get_type_index_for_type(); + std::shared_ptr ptr = + std::static_pointer_cast(std::make_shared(t)); + + return ConcreteArgSpec(type_idx, ptr); + // ArgTypeRuntimeTag::create()); + } + +private: + ConcreteArgSpec(std::type_index const &type_index, + std::shared_ptr ptr) + : type_idx(type_index), ptr(ptr) {} + // ArgTypeRuntimeTag const &); + + // ArgTypeRuntimeTag type_tag; + std::type_index type_idx; + std::shared_ptr ptr; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/include/runtime/config.h b/lib/local-execution/include/local-execution/config.h similarity index 89% rename from lib/runtime/include/runtime/config.h rename to lib/local-execution/include/local-execution/config.h index 34f45040d1..73653aebae 100644 --- a/lib/runtime/include/runtime/config.h +++ b/lib/local-execution/include/local-execution/config.h @@ -13,12 +13,11 @@ * limitations under the License. 
*/ -#ifndef _FLEXFLOW_CONFIG_H_ -#define _FLEXFLOW_CONFIG_H_ -#include "legion.h" +#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONFIG_H_ +#define _FLEXFLOW_LOCAL_EXECUTION_CONFIG_H_ + #include "op-attrs/param_sync.h" #include "utils/fmt.h" -#include "utils/optional.h" #include "utils/visitable.h" #include @@ -47,6 +46,8 @@ struct FFInitInfo : public use_visitable_cmp { bool allowTensorOpMathConversion; }; +using legion_mapping_tag_id_t = unsigned long; + struct FFConfig : public use_visitable_cmp { public: enum PreservedIDs { @@ -64,7 +65,7 @@ struct FFConfig : public use_visitable_cmp { }; FFConfig() = default; - static Legion::MappingTagID get_hash_id(std::string const &pcname); + static legion_mapping_tag_id_t get_hash_id(std::string const &pcname); public: int epochs = 1; @@ -88,16 +89,17 @@ struct FFConfig : public use_visitable_cmp { bool enable_inplace_optimizations = false; // Control Tensor Op Math Conversion bool allow_tensor_op_math_conversion = false; - optional dataset_path = nullopt; - optional export_strategy_computation_graph_file = nullopt; + std::optional dataset_path = std::nullopt; + std::optional export_strategy_computation_graph_file = + std::nullopt; bool include_costs_dot_graph = false; - optional substitution_json_path = nullopt; + std::optional substitution_json_path = std::nullopt; int machine_model_version = 0; - optional machine_model_file = nullopt; + std::optional machine_model_file = std::nullopt; int simulator_segment_size = 16777216; // 16 MB int simulator_max_num_segments = 1; - optional search_num_nodes = nullopt; - optional search_num_workers = nullopt; + std::optional search_num_nodes = std::nullopt; + std::optional search_num_workers = std::nullopt; int base_optimize_threshold = 10; bool enable_control_replication = true; // The default python data loader type is 2 to enable control replication diff --git a/lib/runtime/src/cost_metrics.h b/lib/local-execution/include/local-execution/cost_metrics.h similarity index 95% rename from 
lib/runtime/src/cost_metrics.h rename to lib/local-execution/include/local-execution/cost_metrics.h index 77526ccd1a..edc0190daf 100644 --- a/lib/runtime/src/cost_metrics.h +++ b/lib/local-execution/include/local-execution/cost_metrics.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_COST_METRICS_H -#define _FLEXFLOW_RUNTIME_SRC_COST_METRICS_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H +#define _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H #include "utils/visitable.h" diff --git a/lib/runtime/src/task_spec/device_specific.h b/lib/local-execution/include/local-execution/device_specific.h similarity index 64% rename from lib/runtime/src/task_spec/device_specific.h rename to lib/local-execution/include/local-execution/device_specific.h index e29e4e9450..6136d16f2d 100644 --- a/lib/runtime/src/task_spec/device_specific.h +++ b/lib/local-execution/include/local-execution/device_specific.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_DEVICE_SPECIFIC_ARG_H -#define _FLEXFLOW_RUNTIME_SRC_DEVICE_SPECIFIC_ARG_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H +#define _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H -#include "serialization.h" +#include "local-execution/serialization.h" #include "utils/exception.h" namespace FlexFlow { @@ -10,10 +10,17 @@ template struct DeviceSpecific { DeviceSpecific() = delete; + DeviceSpecific(T ptr_type) { // accessor + size_t device_idx = 0; + DeviceSpecific device_specific = + DeviceSpecific::create(device_idx, ptr_type); + this->ptr = device_specific.ptr; + this->device_idx = device_specific.device_idx; + } template static DeviceSpecific create(size_t device_idx, Args &&...args) { - NOT_IMPLEMENTED(); + NOT_IMPLEMENTED(); // accessor } T const *get(size_t curr_device_idx) const { @@ -26,6 +33,8 @@ struct DeviceSpecific { return this->ptr; } + // TODO: can modify ptr + private: T *ptr; size_t device_idx; diff --git a/lib/runtime/src/legion_tensor_shape.h b/lib/local-execution/include/local-execution/legion_tensor_shape.h similarity 
index 92% rename from lib/runtime/src/legion_tensor_shape.h rename to lib/local-execution/include/local-execution/legion_tensor_shape.h index 1f5fab76a6..ff96ba9a15 100644 --- a/lib/runtime/src/legion_tensor_shape.h +++ b/lib/local-execution/include/local-execution/legion_tensor_shape.h @@ -28,8 +28,8 @@ struct LegionTensorShape : public use_visitable_cmp, DataType data_type; }; -ff_dim_t to_ff(legion_dim_t, int num_dims); -legion_dim_t to_legion(ff_dim_t, int num_dims); +ff_dim_t to_ff(legion_dim_t, size_t num_dims); +legion_dim_t to_legion(ff_dim_t, size_t num_dims); ff_dim_t to_ff(legion_dim_t, TensorShape const &); legion_dim_t to_legion(ff_dim_t, TensorShape const &); diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local-execution/local_allocator.h similarity index 82% rename from lib/local-execution/include/local_allocator.h rename to lib/local-execution/include/local-execution/local_allocator.h index f4b253b281..b47220eb8c 100644 --- a/lib/local-execution/include/local_allocator.h +++ b/lib/local-execution/include/local-execution/local_allocator.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H -#define _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H #include "kernels/allocation.h" #include diff --git a/lib/runtime/src/task_spec/op_arg_ref.h b/lib/local-execution/include/local-execution/op_arg_ref.h similarity index 54% rename from lib/runtime/src/task_spec/op_arg_ref.h rename to lib/local-execution/include/local-execution/op_arg_ref.h index 3e931d79a4..1650656b42 100644 --- a/lib/runtime/src/task_spec/op_arg_ref.h +++ b/lib/local-execution/include/local-execution/op_arg_ref.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H -#include 
"arg_ref.h" -#include "device_specific.h" +#include "local-execution/arg_ref.h" +#include "local-execution/device_specific.h" #include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { @@ -15,13 +15,9 @@ using OpArgRef = ArgRef; using OpArgRefSpec = ArgRefSpec; template -OpArgRef> per_device_op_state() { - return {OpArgRefType::PER_DEVICE_OP_STATE}; -} +OpArgRef> per_device_op_state(); -OpArgRef input_parallel_tensor_shape(int idx) { - return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; -} +OpArgRef input_parallel_tensor_shape(int idx); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h new file mode 100644 index 0000000000..37ca5c239d --- /dev/null +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -0,0 +1,97 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H + +#include "kernels/accessor.h" +#include "local-execution/concrete_arg.h" +#include "local-execution/op_arg_ref.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/op_tensor_spec.h" +#include "local-execution/profiling.h" +#include "local-execution/runtime_arg_ref.h" +#include "local-execution/tasks.h" +#include "local-execution/variadic_tensor_ref.h" +#include "utils/bidict.h" +#include "utils/stack_map.h" +#include +#include +#include +#include + +namespace FlexFlow { + +enum class IsTrainable { YES, NO }; + +using OpArgSpec = + std::variant; + +struct OpTaskBinding { + OpTaskBinding() = default; + + void bind(slot_id, VariadicTensorRef const &) { + NOT_IMPLEMENTED(); + } + void bind(slot_id, OpTensorSpec const &); + void bind_grad(slot_id, OpTensorSpec const &); + + template + void bind_device_specific_arg(slot_id name, T const &t) { + NOT_IMPLEMENTED(); + } + + template + void bind_device_specific_arg(slot_id name, OpArgRef const &t) { + NOT_IMPLEMENTED(); + } + + template + 
void bind_arg(slot_id name, T const &t) { + this->insert_arg_spec(name, ConcreteArgSpec::create(t)); + } + + template + void bind_arg(slot_id name, RuntimeArgRef const &ref) { + this->insert_arg_spec(name, RuntimeArgRefSpec::create(ref)); + } + + template + void bind_arg(slot_id name, OpArgRef const &ref) { + this->insert_arg_spec(name, OpArgRefSpec::create(ref)); + } + + std::unordered_map, OpTensorSpec> const & + get_tensor_bindings() const; + std::unordered_map const &get_arg_bindings() const; + + void insert_arg_spec(slot_id name, OpArgSpec const &arg_spec) { + assert(!contains_key(this->arg_bindings, name)); + this->arg_bindings.insert({name, arg_spec}); + } + + std::unordered_map arg_bindings; + std::unordered_map, OpTensorSpec> tensor_bindings; +}; +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskBinding, + arg_bindings, + tensor_bindings); + +struct OpTaskInvocation { +public: + OpTaskInvocation() = delete; + OpTaskInvocation(task_id_t const &task_id, OpTaskBinding const &binding) + : task_id(task_id), binding(binding) {} + +public: + task_id_t task_id; + OpTaskBinding binding; +}; +FF_VISITABLE_STRUCT(OpTaskInvocation, task_id, binding); + +OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); +OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); + +bool is_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/task_spec/op_task_signature.h b/lib/local-execution/include/local-execution/op_task_signature.h similarity index 73% rename from lib/runtime/src/task_spec/op_task_signature.h rename to lib/local-execution/include/local-execution/op_task_signature.h index 656df39309..3bcb8397b7 100644 --- a/lib/runtime/src/task_spec/op_task_signature.h +++ b/lib/local-execution/include/local-execution/op_task_signature.h @@ -1,8 +1,11 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OP_TASK_SIGNATURE_H -#define _FLEXFLOW_RUNTIME_SRC_OP_TASK_SIGNATURE_H - -#include 
"task_invocation.h" -#include "task_signature.h" +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H + +#include "local-execution/serialization.h" +#include "local-execution/slot_id.h" +#include "local-execution/slot_type.h" +#include "local-execution/tasks.h" +#include "utils/type_index.h" #include "utils/visitable.h" namespace FlexFlow { @@ -14,6 +17,7 @@ enum class TensorRole { }; enum class OpTaskType { INIT, FWD, BWD }; +enum class IsGrad { YES, NO }; enum class OpSlotOptions { OPTIONAL, @@ -25,7 +29,6 @@ enum class OpSlotOptions { struct OpTensorSlotSpec { public: OpTensorSlotSpec() = delete; - OpTensorSlotSpec(slot_id, SlotType, TensorRole); public: slot_id name; @@ -41,7 +44,9 @@ struct OpTaskSignature { OpTaskSignature() = delete; explicit OpTaskSignature(OpTaskType); - OpTaskType get_task_type() const; + OpTaskType get_task_type() const { + return this->type; + } void add_input_slot(slot_id, SlotType slot_type = SlotType::TENSOR); void add_optional_input_slot(slot_id, SlotType slot_type = SlotType::TENSOR); @@ -59,45 +64,35 @@ struct OpTaskSignature { void add_from_slot_spec(OpTensorSlotSpec const &spec); - /* void add_input_slot(slot_id, Legion::PrivilegeMode); */ - /* void add_input_slot(slot_id, SlotType, Legion::PrivilegeMode); */ - - bool operator==(OpTaskSignature const &) const; - bool operator!=(OpTaskSignature const &) const; - template void add_arg_slot(slot_id name) { static_assert(is_serializable::value, "Type must be serializable"); + this->task_arg_types.insert({name, get_type_index_for_type()}); } template - void add_return_value(); + void add_return_value() { + this->return_value = get_type_index_for_type(); + } // adds arg_slot without checking is_serializable, used for arguments that are // deviceSpecific template void add_unchecked_arg_slot(slot_id name) { - NOT_IMPLEMENTED(); + this->task_arg_types.insert({name, get_type_index_for_type()}); } - std::unordered_set 
get_tensor_slots(); + std::unordered_set get_tensor_slots() const; void set_arg_types(std::unordered_map const &); - std::unordered_map get_arg_types(); + std::unordered_map get_arg_types() const; -private: + OpTaskType type; + std::optional return_value; std::unordered_map task_arg_types; std::unordered_set op_tensor_slots; }; - -template -OpTaskSignature init_signature(); -template -OpTaskSignature fwd_signature(); -template -OpTaskSignature bwd_signature(); - -template -OpTaskSignature get_signature(); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + OpTaskSignature, type, return_value, task_arg_types, op_tensor_slots); template void register_task(task_id_t, @@ -112,6 +107,15 @@ void register_task(task_id_t, F const &func, F const &cpu_func); +template +OpTaskSignature init_signature(); + +template +OpTaskSignature fwd_signature(); + +template +OpTaskSignature bwd_signature(); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/op_tensor_spec.h b/lib/local-execution/include/local-execution/op_tensor_spec.h new file mode 100644 index 0000000000..cc2cd75153 --- /dev/null +++ b/lib/local-execution/include/local-execution/op_tensor_spec.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H + +#include "local-execution/op_task_signature.h" + +namespace FlexFlow { + +struct OpTensorSpec { + TensorRole role; + OpSlotOptions slot_option; + req idx; +}; +FF_VISITABLE_STRUCT(OpTensorSpec, role, slot_option, idx); + +OpTensorSpec input_tensor(int); +OpTensorSpec output_tensor(int); +OpTensorSpec weight_tensor(int); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/permissions.h b/lib/local-execution/include/local-execution/permissions.h similarity index 84% rename from lib/runtime/src/permissions.h rename to lib/local-execution/include/local-execution/permissions.h index e7793a1dcb..ce19e38e7e 100644 --- a/lib/runtime/src/permissions.h +++ 
b/lib/local-execution/include/local-execution/permissions.h @@ -1,18 +1,13 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_PERMISSION_H -#define _FLEXFLOW_RUNTIME_SRC_PERMISSION_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H +#define _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H -#include "legion.h" #include "utils/exception.h" #include "utils/fmt.h" -#include "utils/optional.h" namespace FlexFlow { enum class Permissions { NONE, RO, WO, RW }; -Legion::PrivilegeMode to_legion(Permissions); -optional from_legion(Legion::PrivilegeMode); - Permissions join(Permissions lhs, Permissions rhs); Permissions meet(Permissions lhs, Permissions rhs); diff --git a/lib/runtime/include/runtime/profiling.h b/lib/local-execution/include/local-execution/profiling.h similarity index 64% rename from lib/runtime/include/runtime/profiling.h rename to lib/local-execution/include/local-execution/profiling.h index 3f43ede520..bd50801fc4 100644 --- a/lib/runtime/include/runtime/profiling.h +++ b/lib/local-execution/include/local-execution/profiling.h @@ -1,21 +1,20 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_PROFILING_H -#define _FLEXFLOW_RUNTIME_SRC_PROFILING_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H +#define _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H #include "kernels/profiling.h" -#include "legion.h" -#include "loggers.h" +#include "spdlog/spdlog.h" namespace FlexFlow { enum class EnableProfiling { YES, NO }; template -optional +std::optional profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { - optional elapsed = + std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); if (elapsed.has_value()) { - log_profile.debug(s, elapsed.value()); + spdlog::debug(s, elapsed.value()); } return elapsed; } diff --git a/lib/runtime/src/task_spec/runtime_arg_ref.h b/lib/local-execution/include/local-execution/runtime_arg_ref.h similarity index 73% rename from lib/runtime/src/task_spec/runtime_arg_ref.h rename to lib/local-execution/include/local-execution/runtime_arg_ref.h index 
655300e692..295f32455c 100644 --- a/lib/runtime/src/task_spec/runtime_arg_ref.h +++ b/lib/local-execution/include/local-execution/runtime_arg_ref.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H -#include "arg_ref.h" -#include "device_specific.h" -#include "runtime/config.h" +#include "local-execution/arg_ref.h" +#include "local-execution/config.h" +#include "local-execution/device_specific.h" namespace FlexFlow { @@ -20,7 +20,7 @@ using RuntimeArgRefSpec = ArgRefSpec; RuntimeArgRef profiling_settings(); RuntimeArgRef> ff_handle(); -RuntimeArgRef iteration_config(); +RuntimeArgRef> iteration_config(); } // namespace FlexFlow diff --git a/lib/runtime/src/serialization.h b/lib/local-execution/include/local-execution/serialization.h similarity index 55% rename from lib/runtime/src/serialization.h rename to lib/local-execution/include/local-execution/serialization.h index 65601990b0..147ed8159c 100644 --- a/lib/runtime/src/serialization.h +++ b/lib/local-execution/include/local-execution/serialization.h @@ -1,12 +1,9 @@ -#ifndef _FLEXFLOW_RUNTIME_SERIALIZATION_H -#define _FLEXFLOW_RUNTIME_SERIALIZATION_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SERIALIZATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_SERIALIZATION_H #include "kernels/device.h" #include "kernels/nccl.h" -#include "legion.h" -#include "legion/legion_utilities.h" #include "op-attrs/dim_ordered.h" -#include "utils/optional.h" #include "utils/required.h" #include "utils/type_traits.h" #include "utils/variant.h" @@ -28,23 +25,6 @@ namespace FlexFlow { template struct needs_serialization {}; -/* template */ -/* class Serializer { */ -/* void serialize(Legion::Serializer &, T const &) const; */ -/* void deserialize(Legion::Deserializer &, T &) const; */ -/* }; */ - -/* template struct trivially_serializable; */ - -/* template struct - * visit_trivially_serializable; */ - -/* template >::value && - * visit_serializable::value)>::type> */ - 
template struct visit_trivially_serializable; @@ -101,6 +81,10 @@ struct is_trivially_serializable< typename std::enable_if::value>::type> : std::true_type {}; +template +struct is_trivially_serializable> + : is_trivially_serializable {}; + template struct is_trivially_serializable> : is_trivially_serializable {}; @@ -155,108 +139,6 @@ static_assert(std::is_same, static_assert(visit_trivially_serializable::value, ""); static_assert(is_trivially_serializable::value, ""); -template -struct Serialization { - void serialize(Legion::Serializer &, T const &) const; - T deserialize(Legion::Deserializer &) const; -}; - -template -struct Serialization< - T, - typename std::enable_if::value>::type> { - static void serialize(Legion::Serializer &sez, T const &t) { - sez.serialize(&t, sizeof(T)); - } - - static T const &deserialize(Legion::Deserializer &dez) { - void const *cur = dez.get_current_pointer(); - dez.advance_pointer(sizeof(T)); - return *(T const *)cur; - } -}; - -struct needs_serialize_visitor { - bool result = true; - - template - void operator()(char const *, T const &t) { - result &= needs_serialize(t); - } -}; - -template -bool visit_needs_serialize(T const &t) { - needs_serialize_visitor vis; - visit_struct::for_each(t, vis); - return vis.result; -} - -struct serialize_visitor { - serialize_visitor() = delete; - explicit serialize_visitor(Legion::Serializer &sez) : sez(sez) {} - - Legion::Serializer &sez; - - template - void operator()(char const *, T const &t) { - serialize(this->sez, t); - } -}; - -template -void visit_serialize(Legion::Serializer &sez, T const &t) { - serialize_visitor vis(sez); - visit_struct::for_each(t, vis); -} - -struct deserialize_visitor { - deserialize_visitor() = delete; - explicit deserialize_visitor(Legion::Deserializer &dez) : dez(dez) {} - - Legion::Deserializer &dez; - - template - T const &operator()(char const *, T &t) { - deserialize(dez, t); - } -}; - -template -T const &visit_deserialize(Legion::Deserializer &dez) { - 
deserialize_visitor vis(dez); - return visit_struct::for_each(vis); -} - -template -class VisitSerialize { - void serialize(Legion::Serializer &sez, T const &t) const { - return visit_serialize(sez, t); - } - - T const &deserialize(Legion::Deserializer &dez) const { - return visit_deserialize(dez); - } -}; - -template -size_t ff_task_serialize(Legion::Serializer &sez, T const &t) { - static_assert(is_serializable::value, "Type must be serializable"); - - size_t pre_size = sez.get_used_bytes(); - Serialization::serialize(sez, t); - size_t post_size = sez.get_used_bytes(); - - return post_size - pre_size; -} - -template -T const &ff_task_deserialize(Legion::Deserializer &dez) { - static_assert(is_serializable::value, "Type must be serializable"); - - return Serialization::deserialize(dez); -} - } // namespace FlexFlow #endif diff --git a/lib/runtime/src/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h similarity index 94% rename from lib/runtime/src/sim_environment.h rename to lib/local-execution/include/local-execution/sim_environment.h index 4297d9d970..78608a3228 100644 --- a/lib/runtime/src/sim_environment.h +++ b/lib/local-execution/include/local-execution/sim_environment.h @@ -1,12 +1,13 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OPS_SIM_ENVIRONMENT_H -#define _FLEXFLOW_RUNTIME_SRC_OPS_SIM_ENVIRONMENT_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H +#define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H -#include "cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" +#include "local-execution/cost_metrics.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/task_argument_accessor.h" #include "op-attrs/parallel_tensor_shape.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/task_argument_accessor.h" +#include "pcg/machine_view.h" #include namespace FlexFlow { diff --git a/lib/runtime/include/runtime/task_spec/slot_id.h 
b/lib/local-execution/include/local-execution/slot_id.h similarity index 73% rename from lib/runtime/include/runtime/task_spec/slot_id.h rename to lib/local-execution/include/local-execution/slot_id.h index a5e4322d3c..53820fdb2f 100644 --- a/lib/runtime/include/runtime/task_spec/slot_id.h +++ b/lib/local-execution/include/local-execution/slot_id.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_ID_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_ID_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SPEC_SLOT_ID_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SPEC_SLOT_ID_H #include "utils/strong_typedef.h" diff --git a/lib/runtime/src/task_spec/slot_type.h b/lib/local-execution/include/local-execution/slot_type.h similarity index 86% rename from lib/runtime/src/task_spec/slot_type.h rename to lib/local-execution/include/local-execution/slot_type.h index 64b79ee281..957f89fa4e 100644 --- a/lib/runtime/src/task_spec/slot_type.h +++ b/lib/local-execution/include/local-execution/slot_type.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_TYPE_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_TYPE_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SLOT_TYPE_H +#define _FLEXFLOW_LOCAL_EXECUTION_SLOT_TYPE_H #include "utils/fmt.h" diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h new file mode 100644 index 0000000000..663c862e18 --- /dev/null +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -0,0 +1,155 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/linear_kernels.h" +#include "local-execution/arg_ref.h" +#include "local-execution/concrete_arg.h" +#include "local-execution/config.h" +#include "local-execution/device_specific.h" +#include "local-execution/op_task_signature.h" +#include 
"local-execution/permissions.h" +#include "local-execution/tasks.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "utils/variant.h" +#include +#include +#include +#include +#include +#include + +namespace FlexFlow { + +template +struct privilege_mode_to_accessor_t {}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorW; +}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorR; +}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorW; +}; + +template +using privilege_mode_to_accessor = + typename privilege_mode_to_accessor_t::type; + +using PrivilegeType = + std::variant; +using PrivilegeVariadicType = std::variant, + std::vector>; + +// TODO: define device state variant in another file +using DeviceStates = std::variant; + +using OpArgRefTypeBacking = + std::variant>; +using RuntimeArgRefTypeBacking = std::variant, + FFIterationConfig>; + +using ArgRefBacking = std:: + variant; + +struct ITaskArgumentAccessor { + ITaskArgumentAccessor &operator=(ITaskArgumentAccessor const &) = delete; + + virtual ~ITaskArgumentAccessor() = default; + + virtual ConcreteArgSpec const &get_concrete_arg(slot_id) const = 0; + virtual OpArgRefTypeBacking const &get_op_arg_ref(slot_id) const = 0; + virtual RuntimeArgRefTypeBacking const &get_runtime_arg(slot_id) const = 0; + + virtual PrivilegeType + get_tensor(slot_id slot, Permissions priv, IsGrad is_grad) const = 0; + virtual PrivilegeVariadicType get_variadic_tensor(slot_id slot, + Permissions priv, + IsGrad is_grad) const = 0; + + virtual Allocator get_allocator() const = 0; + virtual size_t get_device_idx() const = 0; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(ITaskArgumentAccessor); + +struct TaskArgumentAccessor { + template + T const &get_argument(slot_id slot) const { + if constexpr (is_in_variant::value) { + return std::get(this->ptr->get_op_arg_ref(slot)); + } else if constexpr (is_in_variant::value) { + 
return std::get(this->ptr->get_runtime_arg(slot)); + } else { + return this->ptr->get_concrete_arg(slot).get(); + } + } + + template + privilege_mode_to_accessor get_tensor(slot_id slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, IsGrad::NO)); + } + + template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, IsGrad::YES)); + } + + template + std::vector> + get_variadic_tensor(slot_id slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::NO)); + } + + template + std::vector> + get_variadic_tensor_grad(slot_id slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::YES)); + } + + Allocator get_allocator() const { + return this->ptr->get_allocator(); + } + + template + static + typename std::enable_if::value, + TaskArgumentAccessor>::type + create(Args &&...args) { + return TaskArgumentAccessor( + std::make_shared(std::forward(args)...)); + } + +private: + TaskArgumentAccessor(std::shared_ptr ptr) + : ptr(ptr) {} + std::shared_ptr ptr; +}; + +using DeviceStates = std::variant; + +using TaskImplFunction = std::variant< + std::function, + std::function(TaskArgumentAccessor const &)>>; + +template +TaskImplFunction get_task_impl(); + +template +OpTaskSignature get_signature(); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/tasks.h b/lib/local-execution/include/local-execution/tasks.h similarity index 95% rename from lib/runtime/src/tasks.h rename to lib/local-execution/include/local-execution/tasks.h index 0e07fa3f85..c78fefd4ea 100644 --- a/lib/runtime/src/tasks.h +++ b/lib/local-execution/include/local-execution/tasks.h @@ -1,8 +1,9 @@ -#ifndef _FLEXFLOW_TASKS_H -#define _FLEXFLOW_TASKS_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASKS_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASKS_H -#include "utils/optional.h" +#include #include +#include namespace FlexFlow { @@ -170,9 +171,9 @@ template void 
register_task(task_id_t, std::string const &name, F const &func, - optional cpu_func = nullopt); + std::optional cpu_func = std::nullopt); -template +template void register_task(); void register_tasks(); diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h similarity index 94% rename from lib/local-execution/include/tracked_allocator.h rename to lib/local-execution/include/local-execution/tracked_allocator.h index 4f51670426..ea3eec64e0 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H #include "kernels/allocation.h" -#include "local_allocator.h" +#include "local-execution/local_allocator.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/variadic_tensor_ref.h b/lib/local-execution/include/local-execution/variadic_tensor_ref.h new file mode 100644 index 0000000000..56da1bab64 --- /dev/null +++ b/lib/local-execution/include/local-execution/variadic_tensor_ref.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H + +#include "local-execution/arg_ref.h" +#include "local-execution/op_tensor_spec.h" + +namespace FlexFlow { + +enum class VariadicTensorRefType { INPUT_TENSORS }; + +template +using VariadicTensorRef = ArgRef; + +VariadicTensorRef get_input_tensors(); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local_allocator.cc b/lib/local-execution/src/local_allocator.cc index 0bb7d04574..d393643ead 100644 --- a/lib/local-execution/src/local_allocator.cc +++ b/lib/local-execution/src/local_allocator.cc @@ -1,4 +1,4 @@ -#include "local_allocator.h" +#include "local-execution/local_allocator.h" #include "kernels/device.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_arg_ref.cc 
b/lib/local-execution/src/op_arg_ref.cc new file mode 100644 index 0000000000..8e9b56272b --- /dev/null +++ b/lib/local-execution/src/op_arg_ref.cc @@ -0,0 +1,14 @@ +#include "local-execution/op_arg_ref.h" + +namespace FlexFlow { + +template +OpArgRef> per_device_op_state() { + return {OpArgRefType::PER_DEVICE_OP_STATE}; +} + +OpArgRef input_parallel_tensor_shape(int idx) { + return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc new file mode 100644 index 0000000000..adad2f3a72 --- /dev/null +++ b/lib/local-execution/src/op_task_invocation.cc @@ -0,0 +1,100 @@ +#include "local-execution/op_task_invocation.h" + +namespace FlexFlow { + +OpTensorSpec input_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::INPUT, option, idx}; +} + +OpTensorSpec output_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::OUTPUT, option, idx}; +} + +OpTensorSpec weight_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::WEIGHT, option, idx}; +} + +void OpTaskBinding::bind(slot_id slot, OpTensorSpec const &tensor_spec) { + this->tensor_bindings.insert({{slot, IsGrad::NO}, tensor_spec}); +} + +void OpTaskBinding::bind_grad(slot_id slot, OpTensorSpec const &tensor_spec) { + this->tensor_bindings.insert({{slot, IsGrad::YES}, tensor_spec}); +} + +std::unordered_map, OpTensorSpec> const & + OpTaskBinding::get_tensor_bindings() const { + return this->tensor_bindings; +} + +std::unordered_map const & + OpTaskBinding::get_arg_bindings() const { + return this->arg_bindings; +} + +OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { + OpTaskBinding bwd; + bwd.arg_bindings = fwd.get_arg_bindings(); + bwd.tensor_bindings = fwd.get_tensor_bindings(); + for (auto const &[key, spec] : fwd.get_tensor_bindings()) { + OpSlotOptions slot_option = 
spec.slot_option; + if (slot_option != OpSlotOptions::UNTRAINABLE || + slot_option != OpSlotOptions::OPTIONAL_UNTRAINABLE) { + slot_id slot = key.first; + bwd.bind_grad(slot, spec); + } + } + return bwd; +} + +bool is_op_tensor_spec_invalid(OpTensorSlotSpec tensor_slot_spec, + OpTensorSpec tensor_spec) { + return tensor_spec.role != tensor_slot_spec.tensor_role || + tensor_spec.slot_option != tensor_slot_spec.slot_option; +} + +bool is_tensor_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { + auto tensor_bindings = inv.binding.get_tensor_bindings(); + for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { + std::pair tensor_key = + std::make_pair(op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad); + OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); + if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { + return false; + } + } + return true; +} + +bool is_arg_type_invalid(std::type_index expected_arg_type, + OpArgSpec op_arg_spec) { + std::type_index arg_spec_type = std::visit( + [](auto &&arg) -> std::type_index { return arg.get_type_index(); }, + op_arg_spec); + return arg_spec_type != expected_arg_type; +} + +bool is_arg_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { + auto sig_arg_types = sig.get_arg_types(); + for (auto arg_binding : inv.binding.get_arg_bindings()) { + std::type_index arg_type = sig_arg_types.at(arg_binding.first); + if (is_arg_type_invalid(arg_type, arg_binding.second)) { + return false; + } + } + + return true; +} + +bool is_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { + return is_tensor_invocation_valid(sig, inv) && + is_arg_invocation_valid(sig, inv); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc new file mode 100644 index 0000000000..53a685910e --- /dev/null +++ 
b/lib/local-execution/src/op_task_signature.cc @@ -0,0 +1,81 @@ +#include "local-execution/op_task_signature.h" + +namespace FlexFlow { + +OpTaskSignature::OpTaskSignature(OpTaskType t) : type(t){}; + +void OpTaskSignature::add_input_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_untrainable_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::INPUT, + IsGrad::NO, + OpSlotOptions::UNTRAINABLE}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_untrainable_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::INPUT, + IsGrad::NO, + OpSlotOptions::OPTIONAL_UNTRAINABLE}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_output_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::OUTPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_bwd_necessary_output_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::OUTPUT, + IsGrad::NO, + OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_weight_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::WEIGHT, + IsGrad::NO, + 
OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_weight_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::WEIGHT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::set_arg_types( + std::unordered_map const &arg_type) { + this->task_arg_types = arg_type; +} + +void OpTaskSignature::add_from_slot_spec(OpTensorSlotSpec const &spec) { + this->op_tensor_slots.insert(spec); +} + +} // namespace FlexFlow diff --git a/lib/runtime/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc similarity index 84% rename from lib/runtime/src/ops/attention.cc rename to lib/local-execution/src/ops/attention.cc index 41905f9014..6e6d23cd4a 100644 --- a/lib/runtime/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -15,19 +15,12 @@ #include "attention.h" #include "kernels/attention_kernels.h" -#include "legion.h" -#include "op-attrs/ops/attention.h" -#include "task_spec/op_task_signature.h" +#include "local-execution/op_task_signature.h" namespace FlexFlow { using namespace FlexFlow::Kernels::MultiHeadAttention; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { QUERY_PARALLEL_TENSOR_SHAPE, KEY_PARALLEL_TENSOR_SHAPE, @@ -86,6 +79,12 @@ OpTaskInvocation backward(MultiHeadAttentionAttrs const &attrs) { return {ATTENTION_BWD_TASK_ID, b}; } +// OpArgBacking +// generate_op_arg_backing(std::vector +// tensor_shape_args) { + +// } + static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); @@ -122,35 +121,42 @@ static DeviceSpecific int num_samples = get_piece_shape(query_parallel_tensor_shape)[ff_dim_t(2)]; int num_heads = get_piece_shape(weight_parallel_tensor_shape)[ff_dim_t(1)]; + // MHAPerDeviceState per_device_state = + // 
init_kernel(handle, + // allocator, + // num_samples, + // num_heads, + // qSize, + // kSize, + // vSize, + // qProjSize, + // kProjSize, + // vProjSize, + // oProjSize, + // qoSeqLength, + // kvSeqLength, + // attrs.add_bias_kv); + // return acc.create_device_specific(per_device_state); + DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - attrs.add_bias_kv)); + init_kernel(handle, + allocator, + num_samples, + num_heads, + qSize, + kSize, + vSize, + qProjSize, + kProjSize, + vProjSize, + oProjSize, + qoSeqLength, + kvSeqLength, + attrs.add_bias_kv); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto query = acc.get_tensor(QUERY); auto key = acc.get_tensor(KEY); auto value = acc.get_tensor(VALUE); @@ -162,7 +168,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[MultiHeadAttention] forward_time = %.2lfms\n", + "[MultiHeadAttention] forward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), key.get_float_ptr(), @@ -171,15 +177,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto query 
= acc.get_tensor(QUERY); auto key = acc.get_tensor(KEY); auto value = acc.get_tensor(VALUE); @@ -208,7 +207,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[MultiHeadAttention] backward_time = %.2lfms\n", + "[MultiHeadAttention] backward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), query_grad.get_float_ptr(), @@ -221,14 +220,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, MultiHeadAttentionAttrs const &attrs, InputParallelTensorDesc const &query_shape, @@ -307,7 +298,12 @@ void register_task() { register_task(ATTENTION_INIT_TASK_ID, "Attention Init", init_signature(), - init_task); + init_task_impl); +} + +template <> +OpTaskSignature get_signature() { + return init_signature(); } template <> @@ -331,13 +327,13 @@ void register_task() { register_task(ATTENTION_FWD_TASK_ID, "Attention Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> OpTaskSignature bwd_signature() { OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(ATTENTION_FWD_TASK_ID)); + infer_bwd_signature(fwd_signature()); return bwd; } @@ -347,7 +343,7 @@ void register_task() { register_task(ATTENTION_BWD_TASK_ID, "Attention Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/attention.h b/lib/local-execution/src/ops/attention.h similarity index 91% rename from lib/runtime/src/ops/attention.h rename to lib/local-execution/src/ops/attention.h index 09a4ef036f..c8eb17ecec 100644 --- a/lib/runtime/src/ops/attention.h +++ b/lib/local-execution/src/ops/attention.h @@ -1,9 +1,9 @@ #ifndef 
_FLEXFLOW_ATTENTION_H #define _FLEXFLOW_ATTENTION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/attention.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc similarity index 87% rename from lib/runtime/src/ops/batch_matmul.cc rename to lib/local-execution/src/ops/batch_matmul.cc index 5f40def699..187e97ecaa 100644 --- a/lib/runtime/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -15,20 +15,14 @@ #include "batch_matmul.h" #include "kernels/batch_matmul_kernels.h" -#include "legion.h" +#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" -#include "task_spec/op_task_signature.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchMatmul; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { A_INPUT, // tensor B_INPUT, // tensor @@ -60,7 +54,7 @@ OpTaskInvocation backward(BatchMatmulAttrs const &attrs) { return {BATCHMATMUL_BWD_TASK_ID, bwd}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto a_input = acc.get_tensor(A_INPUT); auto b_input = acc.get_tensor(B_INPUT); auto output = acc.get_tensor(OUTPUT); @@ -91,7 +85,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[BatchMatmul] forward_time = %.2lfms\n", + "[BatchMatmul] forward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), a_input.get_float_ptr(), @@ -105,15 +99,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { iter_config.seq_length); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime 
*runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { // BatchMatmul* bmm = (BatchMatmul*) task->args; FFIterationConfig iter_config = acc.get_argument(ITERATION_CONFIG); @@ -151,7 +138,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[BatchMatmul] backward_time = %.2lfms\n", + "[BatchMatmul] backward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), output_grad.get_float_ptr(), @@ -165,14 +152,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { batch); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, BatchMatmulAttrs const &attrs, InputParallelTensorDesc const &a_input, @@ -225,7 +204,7 @@ void register_task() { register_task(BATCHMATMUL_FWD_TASK_ID, "BatchMatmul Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -241,7 +220,7 @@ void register_task() { register_task(BATCHMATMUL_BWD_TASK_ID, "BatchMatmul Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/batch_matmul.h b/lib/local-execution/src/ops/batch_matmul.h similarity index 84% rename from lib/runtime/src/ops/batch_matmul.h rename to lib/local-execution/src/ops/batch_matmul.h index 7d3f2308da..94457c22be 100644 --- a/lib/runtime/src/ops/batch_matmul.h +++ b/lib/local-execution/src/ops/batch_matmul.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_BATCH_MATMUL_H #define _FLEXFLOW_BATCH_MATMUL_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/op_task_signature.h" +#include 
"local-execution/sim_environment.h" #include "op-attrs/ops/batch_matmul.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc similarity index 81% rename from lib/runtime/src/ops/batch_norm.cc rename to lib/local-execution/src/ops/batch_norm.cc index a52981a8a3..97830f90fe 100644 --- a/lib/runtime/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -15,17 +15,11 @@ #include "batch_norm.h" #include "kernels/batch_norm_kernels.h" -#include "legion/legion_utilities.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchNorm; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { INPUT, // tensor SCALE, // tensor @@ -88,29 +82,19 @@ static DeviceSpecific float *runningMean; DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - runningMean, - output_n, - output_c, - output_h, - output_w, - attrs.relu)); + init_kernel(handle, + allocator, + runningMean, + output_n, + output_c, + output_h, + output_w, + attrs.relu); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -122,23 +106,16 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[BatchNorm] forward_time = %.2lfms\n", - &per_device_state, + "[BatchNorm] forward_time = {:.2lf}ms\n", + per_device_state, 
input.get_float_ptr(), output.get_float_ptr(), scale.get_float_ptr(), bias.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -153,8 +130,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[BatchNorm] backward_time = %.2lfms\n", - &per_device_state, + "[BatchNorm] backward_time = {:.2lf}ms\n", + per_device_state, input.get_float_ptr(), output_grad.get_float_ptr(), output.get_float_ptr(), @@ -165,14 +142,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output.shape.get_volume()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, BatchNormAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -221,6 +190,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); init.add_input_slot(BIAS); init.add_output_slot(OUTPUT); @@ -236,7 +206,7 @@ void register_task() { register_task(BATCHNORM_INIT_TASK_ID, "BatchNorm Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -258,7 +228,7 @@ void register_task() { register_task(BATCHNORM_FWD_TASK_ID, "BatchNorm Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -274,7 +244,7 @@ void register_task() { 
register_task(BATCHNORM_BWD_TASK_ID, "BatchNorm Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/batch_norm.h b/lib/local-execution/src/ops/batch_norm.h similarity index 89% rename from lib/runtime/src/ops/batch_norm.h rename to lib/local-execution/src/ops/batch_norm.h index 906e85a57c..1745a5cac8 100644 --- a/lib/runtime/src/ops/batch_norm.h +++ b/lib/local-execution/src/ops/batch_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/batch_norm.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc similarity index 79% rename from lib/runtime/src/ops/cast.cc rename to lib/local-execution/src/ops/cast.cc index 44230eaf46..9e1f777d73 100644 --- a/lib/runtime/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -15,17 +15,12 @@ #include "cast.h" #include "kernels/cast_kernels.h" -#include "legion/legion_utilities.h" -#include "task_spec/op_task_signature.h" + +#include "local-execution/op_task_signature.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Cast; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - namespace FlexFlow { enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -48,7 +43,7 @@ OpTaskInvocation backward(CastAttrs const &attrs) { return {CAST_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -57,22 +52,15 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, 
profiling, - "[Cast] forward_time = %.2lfms\n", + "[Cast] forward_time = {:.2lf}ms\n", input, output, input.data_type, attrs.dtype); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -83,21 +71,13 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Cast] forward_time = %.2lfms\n", + "[Cast] forward_time = {:.2lf}ms\n", input_grad, output_grad, input.data_type, attrs.dtype); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, CastAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -143,7 +123,7 @@ void register_task() { register_task(CAST_FWD_TASK_ID, "Cast Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -158,7 +138,7 @@ void register_task() { register_task(CAST_BWD_TASK_ID, "Cast Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/cast.h b/lib/local-execution/src/ops/cast.h similarity index 93% rename from lib/runtime/src/ops/cast.h rename to lib/local-execution/src/ops/cast.h index c0c500e869..69aeadf497 100644 --- a/lib/runtime/src/ops/cast.h +++ b/lib/local-execution/src/ops/cast.h @@ -15,9 +15,9 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/cast.h" 
-#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc similarity index 75% rename from lib/runtime/src/ops/combine.cc rename to lib/local-execution/src/ops/combine.cc index 46d5ebb4fe..6df09b53f4 100644 --- a/lib/runtime/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -15,15 +15,11 @@ #include "combine.h" #include "kernels/combine_kernels.h" -#include "task_spec/op_task_invocation.h" +#include "local-execution/op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::Combine; @@ -46,7 +42,7 @@ OpTaskInvocation backward(CombineAttrs const &attrs) { return {COMBINE_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -54,20 +50,13 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Combine] forward_time = %.2lfms\n", + "[Combine] forward_time = {:.2lf}ms\n", input, output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); @@ -75,19 +64,11 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Combine] forward_time = 
%.2lfms\n", + "[Combine] backward_time = {:.2lf}ms\n", input_grad, output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, CombineAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -117,7 +98,7 @@ void register_task() { register_task(COMBINE_FWD_TASK_ID, "Combine Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -133,7 +114,7 @@ void register_task() { register_task(COMBINE_BWD_TASK_ID, "Combine Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/combine.h b/lib/local-execution/src/ops/combine.h similarity index 87% rename from lib/runtime/src/ops/combine.h rename to lib/local-execution/src/ops/combine.h index 6b3a43863b..f9349a01ef 100644 --- a/lib/runtime/src/ops/combine.h +++ b/lib/local-execution/src/ops/combine.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/combine.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc similarity index 78% rename from lib/runtime/src/ops/concat.cc rename to lib/local-execution/src/ops/concat.cc index 1ce549cc57..f3c2eba48f 100644 --- a/lib/runtime/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -15,21 +15,16 @@ #include "concat.h" #include "kernels/concat_kernels.h" -#include "legion/legion_utilities.h" + +#include "local-execution/op_task_signature.h" +#include "local-execution/variadic_tensor_ref.h" #include "op-attrs/get_output_shapes.h" -#include "task_spec/op_task_signature.h" -#include 
"task_spec/variadic_tensor_ref.h" #include "utils/hash-utils.h" namespace FlexFlow { using namespace FlexFlow::Kernels::Concat; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { INPUTS, OUTPUT, ATTRS, PROFILING, HANDLE, NUM_INPUTS }; OpTaskInvocation forward(ConcatAttrs const &attrs) { @@ -48,7 +43,7 @@ OpTaskInvocation backward(ConcatAttrs const &attrs) { return {CONCAT_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -59,21 +54,14 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Concat] forward_time = %.2lfms\n", + "[Concat] forward_time = {:.2lf}ms\n", output, inputs, attrs.axis); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -84,20 +72,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Concat] backward_time = %.2lfms\n", + "[Concat] backward_time = {:.2lf}ms\n", output_grad, input_grads, attrs.axis); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ConcatAttrs const &attrs, @@ -132,6 +112,7 @@ CostMetrics template <> OpTaskSignature 
fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); + fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUTS, SlotType::VARIADIC); @@ -145,13 +126,13 @@ void register_task() { register_task(CONCAT_FWD_TASK_ID, "Concat Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> OpTaskSignature bwd_signature() { OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(CONCAT_FWD_TASK_ID)); + infer_bwd_signature(fwd_signature()); return bwd; } @@ -161,7 +142,7 @@ void register_task() { register_task(CONCAT_BWD_TASK_ID, "Concat Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/concat.h b/lib/local-execution/src/ops/concat.h similarity index 89% rename from lib/runtime/src/ops/concat.h rename to lib/local-execution/src/ops/concat.h index 27dec47743..fa61d87e77 100644 --- a/lib/runtime/src/ops/concat.h +++ b/lib/local-execution/src/ops/concat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/concat.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc similarity index 78% rename from lib/runtime/src/ops/conv_2d.cc rename to lib/local-execution/src/ops/conv_2d.cc index 01d8abab55..eef4c21a45 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -1,17 +1,10 @@ #include "conv_2d.h" #include "kernels/conv_2d_kernels.h" -#include "legion/legion_utilities.h" -#include "mpark/variant.hpp" #include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Conv2D; enum Slots { @@ -70,33 +63,23 @@ 
static DeviceSpecific auto filter_grad = acc.get_tensor_grad(FILTER); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - attrs.activation, - attrs.kernel_h, - attrs.kernel_w, - attrs.groups, - attrs.padding_h, - attrs.padding_w, - attrs.stride_h, - attrs.stride_w, - input, - output, - filter.get_float_ptr(), - filter_grad.get_float_ptr())); + init_kernel(handle, + attrs.activation, + attrs.kernel_h, + attrs.kernel_w, + attrs.groups, + attrs.padding_h, + attrs.padding_w, + attrs.stride_h, + attrs.stride_w, + input, + output, + filter.get_float_ptr(), + filter_grad.get_float_ptr()); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -109,7 +92,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Conv2d] forward_time = %.2lfms\n", + "[Conv2d] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -118,15 +101,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.activation); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -143,7 +119,7 @@ 
static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Conv2d] backward_time = %.2lfms\n", + "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), input_grad.get_float_ptr(), @@ -155,14 +131,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.activation); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, Conv2DAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -228,7 +196,7 @@ void register_task() { register_task(CONV2D_INIT_TASK_ID, "Conv2d Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -252,7 +220,7 @@ void register_task() { register_task(CONV2D_FWD_TASK_ID, "Conv2d Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -268,7 +236,7 @@ void register_task() { register_task(CONV2D_BWD_TASK_ID, "Conv2d Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/conv_2d.h b/lib/local-execution/src/ops/conv_2d.h similarity index 89% rename from lib/runtime/src/ops/conv_2d.h rename to lib/local-execution/src/ops/conv_2d.h index 7225099a47..0c8181adce 100644 --- a/lib/runtime/src/ops/conv_2d.h +++ b/lib/local-execution/src/ops/conv_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/conv_2d.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc similarity index 77% rename from lib/runtime/src/ops/dropout.cc rename to 
lib/local-execution/src/ops/dropout.cc index fe85afea38..9d680054ea 100644 --- a/lib/runtime/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -1,18 +1,12 @@ #include "dropout.h" #include "kernels/dropout_kernels.h" -#include "legion/legion_utilities.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/task_signature.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Dropout; enum Slots { INPUT, OUTPUT, ATTRS, PER_DEVICE_STATE, FF_HANDLE, PROFILING }; @@ -54,21 +48,11 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator)); + init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -77,21 +61,14 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Dropout] forward_time = %.2lfms\n", + "[Dropout] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, 
runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -102,20 +79,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Dropout] backward_time = %.2lfms\n", + "[Dropout] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, DropoutAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -156,6 +125,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); + init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(FF_HANDLE); init.add_output_slot(OUTPUT); @@ -170,7 +140,7 @@ void register_task() { register_task(DROPOUT_INIT_TASK_ID, "Dropout Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -191,7 +161,7 @@ void register_task() { register_task(DROPOUT_FWD_TASK_ID, "Dropout Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -207,7 +177,7 @@ void register_task() { register_task(DROPOUT_BWD_TASK_ID, "Dropout Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/dropout.h b/lib/local-execution/src/ops/dropout.h similarity index 85% rename from lib/runtime/src/ops/dropout.h rename to lib/local-execution/src/ops/dropout.h index 88a255d140..53fbeb3857 100644 --- a/lib/runtime/src/ops/dropout.h +++ b/lib/local-execution/src/ops/dropout.h @@ 
-1,10 +1,10 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" +#include "local-execution/tasks.h" #include "op-attrs/ops/dropout.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" -#include "tasks.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc similarity index 81% rename from lib/runtime/src/ops/element_binary.cc rename to lib/local-execution/src/ops/element_binary.cc index f6be2198ca..a2e9ee2ba8 100644 --- a/lib/runtime/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -1,16 +1,11 @@ #include "element_binary.h" #include "kernels/element_binary_kernels.h" -#include "legion/legion_utilities.h" + #include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::ElementBinary; enum Slots { @@ -66,27 +61,17 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - attrs.type, - attrs.should_broadcast_lhs, - attrs.should_broadcast_rhs, - input_lhs.shape, - input_rhs.shape, - output.shape)); + init_kernel(handle, + attrs.type, + attrs.should_broadcast_lhs, + attrs.should_broadcast_rhs, + input_lhs.shape, + input_rhs.shape, + output.shape); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto 
per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -99,7 +84,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[ElementBinary] forward_time = %.2lfms\n", + "[ElementBinary] forward_time = {:.2lf}ms\n", per_device_state, input_lhs.get_float_ptr(), input_rhs.get_float_ptr(), @@ -109,15 +94,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { handle); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -133,7 +111,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[ElementBinary] backward_time = %.2lfms\n", + "[ElementBinary] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_lhs.get_float_ptr(), @@ -146,14 +124,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { handle); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ElementBinaryAttrs const &attrs, @@ -221,7 +191,7 @@ void register_task() { register_task(ELEMENTBINARY_INIT_TASK_ID, "ElementBinary Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -245,7 +215,7 @@ void register_task() { register_task(ELEMENTBINARY_FWD_TASK_ID, "ElementBinary Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -261,7 +231,7 @@ void register_task() { 
register_task(ELEMENTBINARY_BWD_TASK_ID, "ElementBinary Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/element_binary.h b/lib/local-execution/src/ops/element_binary.h similarity index 95% rename from lib/runtime/src/ops/element_binary.h rename to lib/local-execution/src/ops/element_binary.h index 342909c468..fa4202dffd 100644 --- a/lib/runtime/src/ops/element_binary.h +++ b/lib/local-execution/src/ops/element_binary.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_ELEMENT_BINARY_H #define _FLEXFLOW_ELEMENT_BINARY_H +#include "local-execution/sim_environment.h" #include "op-attrs/ops/element_binary.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc similarity index 76% rename from lib/runtime/src/ops/element_unary.cc rename to lib/local-execution/src/ops/element_unary.cc index f41a8b3551..2ad5d797f5 100644 --- a/lib/runtime/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -1,15 +1,11 @@ #include "element_unary.h" #include "kernels/element_unary_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::ElementUnary; @@ -27,7 +23,6 @@ enum Slots { OpTaskInvocation init(ElementUnaryUnifiedAttrs const &attrs) { OpTaskBinding b; - b.bind_arg(HANDLE, ff_handle()); b.bind_arg(ATTRS, attrs); b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0)); @@ -58,32 +53,21 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); ProfilingSettings profiling = acc.get_argument(PROFILING); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); ParallelTensorShape input_shape = acc.get_argument(INPUT_SHAPE); ParallelTensorShape output_shape = 
get_output_shape(attrs, input_shape); - DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(input_shape, output_shape, attrs)); + DeviceSpecific per_device_state = init_kernel( + get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto &handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = @@ -91,7 +75,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[ElementUnary] forward_time = %.2lfms\n", + "[ElementUnary] forward_time = {:.2lf}ms\n", per_device_state, attrs, handle, @@ -99,22 +83,15 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto &handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); 
@@ -122,7 +99,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[ElementUnary] backward_time = %.2lfms\n", + "[ElementUnary] backward_time = {:.2lf}ms\n", per_device_state, attrs, handle, @@ -132,14 +109,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ElementUnaryUnifiedAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -147,7 +116,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, MachineView const &mv) { auto env = sim.new_environment(); - ParallelTensorShape output_shape = get_output_shape(attrs, input_shape); + ParallelTensorShape output_shape = get_output_shape(attrs, input_shape.shape); SimTaskBinding init_binding; init_binding.bind_arg(HANDLE, ff_handle()); @@ -182,6 +151,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); + init.add_arg_slot(INPUT_SHAPE); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -196,7 +166,7 @@ void register_task() { register_task(ELEMENTUNARY_INIT_TASK_ID, "ElementUnary Init", init_signature(), - init_task); + init_task_impl); } template <> @@ -217,7 +187,7 @@ void register_task() { register_task(ELEMENTUNARY_FWD_TASK_ID, "ElementUnary Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -233,7 +203,7 @@ void register_task() { register_task(ELEMENTUNARY_BWD_TASK_ID, "ElementUnary Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/element_unary.h b/lib/local-execution/src/ops/element_unary.h similarity index 85% rename from 
lib/runtime/src/ops/element_unary.h rename to lib/local-execution/src/ops/element_unary.h index f44efc28db..e0f58e8a75 100644 --- a/lib/runtime/src/ops/element_unary.h +++ b/lib/local-execution/src/ops/element_unary.h @@ -1,15 +1,12 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/element_unary.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { -using ElementUnaryUnifiedAttrs = - variant; - template <> void register_task(); template <> diff --git a/lib/runtime/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc similarity index 81% rename from lib/runtime/src/ops/embedding.cc rename to lib/local-execution/src/ops/embedding.cc index a1bc915d2f..00d6d033d4 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -15,17 +15,11 @@ #include "embedding.h" #include "kernels/embedding_kernels.h" -#include "legion.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/embedding.h" namespace FlexFlow { -// declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Embedding; enum Slots { INPUT, WEIGHT, OUTPUT, ATTRS, PROFILING }; @@ -49,7 +43,7 @@ OpTaskInvocation backward(EmbeddingAttrs const &attrs) { return {EMBED_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); @@ -59,7 +53,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Embedding] forward_time = %.2lfms\n", + "[Embedding] forward_time = {:.2lf}ms\n", input, output, weight, @@ -71,15 +65,8 @@ static optional 
forward_task_impl(TaskArgumentAccessor const &acc) { input.shape[legion_dim_t(1)]); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); @@ -89,7 +76,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Embedding] forward_time = %.2lfms\n", + "[Embedding] backward_time = {:.2lf}ms\n", input, output, weight_grad, @@ -98,15 +85,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), - input.shape[ff_dim_t(0)]); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + input.shape.at(ff_dim_t(0))); } CostMetrics measure_operator_cost(SimEnvFactory const &sim, @@ -158,7 +137,7 @@ void register_task() { register_task(EMBED_FWD_TASK_ID, "Embed Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -172,7 +151,7 @@ void register_task() { register_task(EMBED_BWD_TASK_ID, "Embed Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/embedding.h b/lib/local-execution/src/ops/embedding.h similarity index 88% rename from lib/runtime/src/ops/embedding.h rename to lib/local-execution/src/ops/embedding.h index cd1b14fa66..c33b1161bf 100644 --- a/lib/runtime/src/ops/embedding.h +++ b/lib/local-execution/src/ops/embedding.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H 
+#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/embedding.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc similarity index 76% rename from lib/runtime/src/ops/flat.cc rename to lib/local-execution/src/ops/flat.cc index f53a6185b6..3c2499da79 100644 --- a/lib/runtime/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -5,10 +5,6 @@ namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::Flat; @@ -30,27 +26,20 @@ OpTaskInvocation backward(FlatAttrs const &attrs) { return {FLAT_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, profiling, - "[Flat] forward_time = %.2lfms\n", + "[Flat] forward_time = {:.2lf}ms\n", input, output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -59,20 +48,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Flat] forward_time = %.2lfms\n", + "[Flat] backward_time = {:.2lf}ms\n", input, input_grad.get_float_ptr(), output_grad.get_float_ptr()); } -static void 
backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, FlatAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -115,7 +96,7 @@ void register_task() { register_task(FLAT_FWD_TASK_ID, "Flat Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -130,7 +111,7 @@ void register_task() { register_task(FLAT_BWD_TASK_ID, "Flat Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/flat.h b/lib/local-execution/src/ops/flat.h similarity index 93% rename from lib/runtime/src/ops/flat.h rename to lib/local-execution/src/ops/flat.h index 13246028fb..d9ea4d3985 100644 --- a/lib/runtime/src/ops/flat.h +++ b/lib/local-execution/src/ops/flat.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H +#include "local-execution/sim_environment.h" #include "op-attrs/ops/flat.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc new file mode 100644 index 0000000000..50b27d72a6 --- /dev/null +++ b/lib/local-execution/src/ops/gather.cc @@ -0,0 +1,215 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "gather.h" +#include "kernels/gather_kernels.h" +#include "local-execution/legion_tensor_shape.h" +#include "op-attrs/get_output_shapes.h" +#include + +namespace FlexFlow { + +using namespace FlexFlow::Kernels::Gather; + +enum Slots { INPUT, OUTPUT, INDEX, ATTRS, HANDLE, PROFILING, PER_DEVICE_STATE }; + +OpTaskInvocation init(GatherAttrs const &attrs) { + OpTaskBinding binding; + + binding.bind(INPUT, input_tensor(0)); + binding.bind(INDEX, input_tensor(1)); + binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + + return {GATHER_INIT_TASK_ID, binding}; +} + +OpTaskInvocation forward(GatherAttrs const &attrs) { + OpTaskBinding binding; + + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INDEX, weight_tensor(0)); + + return {GATHER_FWD_TASK_ID, binding}; +} + +OpTaskInvocation backward(GatherAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); + + return {GATHER_BWD_TASK_ID, binding}; +} + +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto index = acc.get_tensor(INDEX); + auto output = acc.get_tensor(OUTPUT); + + PerDeviceFFHandle handle = acc.get_argument(HANDLE); + auto const &attrs = acc.get_argument(ATTRS); + legion_dim_t legion_dim = to_legion(attrs.dim, input.shape.num_dims()); + + assert(input.shape.get_dim() == index.shape.get_dim()); + assert(output.shape.get_dim() == index.shape.get_dim()); + + for (int i = 0; i < input.shape.get_dim(); i++) { + assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); + if (i != legion_dim.value()) { + assert(input.shape[legion_dim_t(i)] == index.shape[legion_dim_t(i)]); + } + } + + return DeviceSpecific({handle, legion_dim}); +} 
+ +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + + auto input = acc.get_tensor(INPUT); + auto index = acc.get_tensor(INDEX); + auto output = acc.get_tensor(OUTPUT); + + return profile(forward_kernel, + profiling, + "[Gather] forward_time = {:.2lf}ms\n", + per_device_state, + input, + index, + output); +} + +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto index = acc.get_tensor(INDEX); + auto input_grad = acc.get_tensor_grad(INPUT); + + return profile(backward_kernel, + profiling, + "[Gather] backward_time = {:.2lf}ms\n", + per_device_state, + output_grad, + index, + input_grad); +} + +CostMetrics measure_operator_cost(SimEnvFactory const &sim, + GatherAttrs const &attrs, + InputParallelTensorDesc const &input_shape, + InputParallelTensorDesc const &index_shape, + ProfilingSettings const &settings, + MachineView const &mv) { + + auto env = sim.new_environment(); + + std::vector output_shape = + get_output_shapes(attrs, input_shape.shape, index_shape.shape); + + SimTaskBinding fwd_binding; + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(ATTRS, attrs); + + fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind(INDEX, index_shape); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(GATHER_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(GATHER_BWD_TASK_ID, bwd_binding); + + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return 
make_metrics(forward_time, backward_time, sync_time, env); +} + +template <> +OpTaskSignature init_signature() { + OpTaskSignature init(OpTaskType::INIT); + + init.add_input_slot(INPUT); + init.add_input_slot(INDEX); + init.add_output_slot(OUTPUT); + + init.add_arg_slot(ATTRS); + init.add_unchecked_arg_slot(HANDLE); + + init.add_return_value(); + + return init; +} + +template <> +void register_task() { + register_task(GATHER_INIT_TASK_ID, + "Gather Init", + init_signature(), + init_task_impl); +} + +template <> +OpTaskSignature fwd_signature() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(ATTRS); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + fwd.add_weight_slot(INDEX); + + return fwd; +} + +template <> +void register_task() { + register_task(GATHER_FWD_TASK_ID, + "Gather Fwd", + fwd_signature(), + forward_task_impl); +} + +template <> +OpTaskSignature bwd_signature() { + OpTaskSignature bwd = + infer_bwd_signature(fwd_signature()); + + return bwd; +} + +template <> +void register_task() { + register_task(GATHER_BWD_TASK_ID, + "Gather Bwd", + bwd_signature(), + backward_task_impl); +} + +}; // namespace FlexFlow diff --git a/lib/local-execution/src/ops/gather.h b/lib/local-execution/src/ops/gather.h new file mode 100644 index 0000000000..e2de09d96a --- /dev/null +++ b/lib/local-execution/src/ops/gather.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_GATHER_H +#define _FLEXFLOW_GATHER_H + +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" +#include "op-attrs/ops/gather.h" + +namespace FlexFlow { + +template <> +void register_task(); +template <> +void register_task(); +template <> +void register_task(); + +OpTaskInvocation init(GatherAttrs const &); +OpTaskInvocation forward(GatherAttrs const &); +OpTaskInvocation backward(GatherAttrs const &); + +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + GatherAttrs const &attrs, + InputParallelTensorDesc 
const &input, + InputParallelTensorDesc const &index, + ProfilingSettings const &settings, + MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc similarity index 61% rename from lib/runtime/src/ops/layer_norm.cc rename to lib/local-execution/src/ops/layer_norm.cc index 6bc671c249..620758772c 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -15,20 +15,17 @@ #include "layer_norm.h" #include "kernels/layer_norm_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" #include -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - namespace FlexFlow { +using namespace FlexFlow::Kernels::LayerNorm; + enum Slots { PROFILING, INPUT, @@ -59,7 +56,7 @@ OpTaskInvocation forward(LayerNormAttrs const &attrs) { b.bind(GAMMA, weight_tensor(0)); // todo, this may have some problem b.bind(BETA, weight_tensor(1)); // how to get gmmam and beta b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, per_device_state()); + b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); return {LAYERNORM_FWD_TASK_ID, b}; } @@ -70,71 +67,56 @@ OpTaskInvocation backward(LayerNormAttrs const &attrs) { return {LAYERNORM_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto gamma = acc.get_tensor(GAMMA); - auto beta = acc.get_tensor(BETA); +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto gamma = acc.get_tensor(GAMMA); + auto beta = 
acc.get_tensor(BETA); ProfilingSettings profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); return profile(forward_kernel, profiling, - "[LayerNorm] forward time = %.2lfms\n", + "[LayerNorm] forward time = {:.2lf}ms\n", state, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - beta.get_float_ptr()); + input, + output, + gamma, + beta); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto gamma = acc.get_tensor(GAMMA); -static optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto gamma = acc.get_tensor(GAMMA); - - auto input_grad = acc.get_tensor(INPUT_GRAD); - auto gamma_grad = acc.get_tensor(GAMMA_GRAD); - auto beta_grad = acc.get_tensor(BETA_GRAD); - auto output_grad = acc.get_tensor(OUTPUT_GRAD); + auto input_grad = acc.get_tensor_grad(INPUT); + auto gamma_grad = acc.get_tensor_grad(GAMMA); + auto beta_grad = acc.get_tensor_grad(BETA); + auto output_grad = acc.get_tensor_grad(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); return profile(backward_kernel, profiling, - "[LayerNorm] backward time = %.2lfms\n", + "[LayerNorm] backward time = {:.2lf}ms\n", state, - output_grad.get_float_ptr(), - input.get_float_ptr(), - input_grad.get_float_ptr(), - gamma.get_float_ptr(), - gamma_grad.get_float_ptr(), - beta_grad.get_float_ptr()); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + output_grad, + input, + input_grad, + gamma, + gamma_grad, + beta_grad); } static 
DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); + auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); - auto input = acc.get_tensor(INPUT); - FFHandler handle = acc.get_argument(HANDLE); + auto input = acc.get_tensor(INPUT); + auto handle = acc.get_argument(HANDLE); // question: how to get batch_size and effective_num_elements int64_t effective_batch_size, effective_num_elements; @@ -143,29 +125,20 @@ static DeviceSpecific M *= input.shape.at(legion_dim_t(attrs.axes[i])); } int num_replicas = 1; - for (int i = 0; i < intput.shape.num_dims(); i++) { + for (int i = 0; i < input.shape.num_dims(); i++) { num_replicas *= input.shape.at(legion_dim_t(i)); effective_num_elements = M; effective_batch_size = input.shape.get_volume() / M; - - DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, - attrs.eps)); } -} -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); + DeviceSpecific per_device_state = + init_kernel(handle, + allocator, + attrs.elementwise_affine, + effective_batch_size, + effective_num_elements, + attrs.eps); + return per_device_state; } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -173,18 +146,19 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; init_binding.bind_arg(HANDLE, ff_handle()); init_binding.bind_arg(ATTRS, attrs); - init.binding.bind(INPUT, 
input.shape); + init_binding.bind(INPUT, input.shape); auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding); - DeviceSpecific = init_task_impl(init_accessor); + DeviceSpecific per_device_state = + init_task_impl(init_accessor); SimTaskBinding fwd_binding; fwd_binding.bind(INPUT, input.shape); @@ -192,9 +166,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); - // TODO how to handle gamma and beta, where are they from - fwd_binding.bind(GAMMA, input_shape); - fwd_binding.bind(BETA, input_shape); + fwd_binding.bind(GAMMA, input.shape); + fwd_binding.bind(BETA, input.shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding); @@ -222,7 +195,7 @@ OpTaskSignature fwd_signature() { } template <> -OpTaskSignature bwd_signature() { +OpTaskSignature bwd_signature() { OpTaskSignature bwd = infer_bwd_signature(fwd_signature()); return bwd; @@ -231,6 +204,7 @@ OpTaskSignature bwd_signature() { template <> OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -245,7 +219,7 @@ void register_task() { register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signature(), - init_task); + init_task_impl); } template <> @@ -253,15 +227,15 @@ void register_task() { register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd_signature(), - forward_task); + forward_task_impl); } template <> void register_task() { register_task(LAYERNORM_BWD_TASK_ID, "LayerNorm backward", - bwd_signature(), - backward_task); + bwd_signature(), + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/layer_norm.h b/lib/local-execution/src/ops/layer_norm.h similarity index 97% rename from lib/runtime/src/ops/layer_norm.h rename 
to lib/local-execution/src/ops/layer_norm.h index 83e6733bf6..4eadb9ff09 100644 --- a/lib/runtime/src/ops/layer_norm.h +++ b/lib/local-execution/src/ops/layer_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/layer_norm.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc similarity index 54% rename from lib/runtime/src/ops/linear.cc rename to lib/local-execution/src/ops/linear.cc index 96d037913c..e2c9d9aef4 100644 --- a/lib/runtime/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -1,32 +1,14 @@ #include "linear.h" #include "kernels/linear_kernels.h" -#include "layer.h" -#include "legion/legion_utilities.h" +#include "local-execution/task_argument_accessor.h" #include "op-attrs/ff_dim.h" #include "op-attrs/get_output_shapes.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/graph/views.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Linear; enum slots { @@ -43,12 +25,12 @@ enum slots { OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(HANDLE, ff_handle()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(ATTRS, attrs); - bind.bind(INPUT, input_tensor(0)); // input - bind.bind(WEIGHT, 
weight_tensor(0)); // weight - bind.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); // input + binding.bind(WEIGHT, weight_tensor(0)); // weight + binding.bind(OUTPUT, output_tensor(0)); // output return {LINEAR_INIT_TASK_ID, binding}; } @@ -56,14 +38,17 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - bind.bind(INPUT, input_tensor(0)); // input - bind.bind(WEIGHT, weight_tensor(0)); // weight - bind.bind(OUTPUT, output_tensor(0)); // output - bind.bind(BIAS, bias_tensor(0)); // bias + binding.bind(INPUT, input_tensor(0)); // input + binding.bind(WEIGHT, weight_tensor(0)); // weight + binding.bind(OUTPUT, output_tensor(0)); // output + if (attrs.use_bias) { + binding.bind(BIAS, weight_tensor(1)); // bias + } - bing.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(PER_DEVICE_STATE, per_device_state()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + binding.bind_arg(ATTRS, attrs); return {LINEAR_FWD_TASK_ID, binding}; } @@ -74,51 +59,38 @@ OpTaskInvocation backward(LinearAttrs const &attrs) { return {LINEAR_BWD_TASK_ID, b}; } -static DeviceSpecific - init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - Allocator allocator = acc.get_allocator(); +static LinearPerDeviceState init_task_impl(TaskArgumentAccessor const &acc) { + auto const &attrs = acc.get_argument(ATTRS); PerDeviceFFHandle handle = acc.get_argument(HANDLE); auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); int out_dim = output.shape.at(ff_dim_t{0}); - int batch_size = output.shape.at.(ff_dim_t{1}); + int batch_size = output.shape.at(ff_dim_t{1}); float *one_ptr; - DeviceSpecific state = - acc.create_device_specific( - init_kernel(handle, - allocator, - one_ptr, - 
attrs.regularizer, - attrs.use_bias, - input.data_type, - weight.data_type, - output.data_type, - batch_size, - attrs.out_channels)); + LinearPerDeviceState state = init_kernel(handle, + one_ptr, + attrs.regularizer, + attrs.use_bias, + input.data_type, + weight.data_type, + output.data_type, + batch_size, + attrs.out_channels); return state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); auto bias = acc.get_tensor(BIAS); - auto state = acc.get_device_specific(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); @@ -133,7 +105,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Linear] forward_time = %.2lfms\n", + "[Linear] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -144,15 +116,10 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { batch_size); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -}; +; -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); @@ -161,7 +128,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { 
auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); auto output_grad = acc.get_tensor_grad(OUTPUT); - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); @@ -176,65 +144,63 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, - "[Linear] backward_time = %.2lfms\n", + "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), - output.get_float_ptr(), - output_grad.get_float_ptr(), - weight.get_float_ptr(), - weight_grad.get_float_ptr(), - bias_ptr, + (void *)input.get_float_ptr(), + (void *)input_grad.get_float_ptr(), + (void *)output.get_float_ptr(), + (void *)output_grad.get_float_ptr(), + (void *)weight.get_float_ptr(), + (void *)weight_grad.get_float_ptr(), + (void *)bias_ptr, in_dim, out_dim, batch_size); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, LinearAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shape(input.shape, attrs); + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + ParallelTensorShape weight_shape = get_weights_shape(attrs, input.shape); + ParallelTensorShape bias_shape = get_bias_shape(attrs, input.shape); SimTaskBinding init_binding; - init_binding.bind(INPUT, input_tensor(0)); - init_binding.bind(WEIGHT, weight_tensor(0)); - init_binding.bind(BIAS, bias_tensor(0)); - 
init_binding.bind(OUTPUT, output_tensor(0)); + init_binding.bind(INPUT, input.shape); + init_binding.bind(WEIGHT, weight_shape); + if (attrs.use_bias) { + init_binding.bind(BIAS, bias_shape); + } + init_binding.bind(OUTPUT, output_shape); init_binding.bind_arg(ATTRS, attrs); init_binding.bind_arg(HANDLE, ff_handle()); auto init_accessor = env.get_init_accessor(LINEAR_INIT_TASK_ID, init_binding); - DeviceSpecific per_device_state = - init_task_impl(init_accessor); + LinearPerDeviceState per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; - fwd_bind.bind(INPUT, input_tensor(0)); // input - fwd_bind.bind(WEIGHT, weight_tensor(0)); // weight - fwd_bind.bind(OUTPUT, output_tensor(0)); // output - fwd_bind.bind(BIAS, bias_tensor(0)); // bias + fwd_binding.bind(INPUT, input.shape); // input + fwd_binding.bind(WEIGHT, weight_shape); // weight + fwd_binding.bind(OUTPUT, output_shape); // output + if (attrs.use_bias) { + fwd_binding.bind(BIAS, bias_shape); // bias + } - fwd_bid.bind_arg(PROFILING, profiling_settings()); - fwd_bind.bind_arg(PER_DEVICE_STATE, per_device_state()); - fwd_bind.bind_arg(ATTRS, attrs); + fwd_binding.bind_arg(PROFILING, profiling_settings()); + fwd_binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + fwd_binding.bind_arg(ATTRS, attrs); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_accessor(LINEAR_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = env.get_accessor(LINEAR_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = env.get_fwd_accessor(LINEAR_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(LINEAR_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -248,15 +214,14 @@ OpTaskSignature init_signature() { OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); - init.add_input_slot(WEIGHT); - init.add_input_slot(BIAS); + 
init.add_weight_slot(WEIGHT); init.add_output_slot(OUTPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); - return init, + return init; } template <> @@ -264,8 +229,8 @@ OpTaskSignature fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); - fwd.add_input_slot(WEIGHT); - fwd.add_input_slot(BIAS); + fwd.add_weight_slot(WEIGHT); + fwd.add_optional_weight_slot(BIAS); fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); @@ -281,13 +246,28 @@ OpTaskSignature bwd_signature() { return bwd; } +template <> +TaskImplFunction get_task_impl() { + return init_task_impl; +} + +template <> +TaskImplFunction get_task_impl() { + return forward_task_impl; +} + +template <> +TaskImplFunction get_task_impl() { + return backward_task_impl; +} + template <> void register_task() { register_task(LINEAR_INIT_TASK_ID, "Linear::init_task", init_signature(), - init_task); + init_task_impl); } template <> @@ -295,7 +275,7 @@ void register_task() { register_task(LINEAR_FWD_TASK_ID, "Linear::fwd_task", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -303,7 +283,11 @@ void register_task() { register_task(LINEAR_BWD_TASK_ID, "Linear::bwd_task", bwd_signature(), - backward_task); + backward_task_impl); +} + +std::vector get_task_ids(LinearAttrs const &) { + return {LINEAR_INIT_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID}; } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/linear.h b/lib/local-execution/src/ops/linear.h similarity index 98% rename from lib/runtime/src/ops/linear.h rename to lib/local-execution/src/ops/linear.h index 2b476382ef..2ff9016114 100644 --- a/lib/runtime/src/ops/linear.h +++ b/lib/local-execution/src/ops/linear.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/linear.h" -#include "op_task_invocation.h" -#include 
"sim_environment.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/noop.cc b/lib/local-execution/src/ops/noop.cc similarity index 95% rename from lib/runtime/src/ops/noop.cc rename to lib/local-execution/src/ops/noop.cc index 6b8510607a..168d547c17 100644 --- a/lib/runtime/src/ops/noop.cc +++ b/lib/local-execution/src/ops/noop.cc @@ -14,7 +14,7 @@ */ #include "noop.h" -#include "task_spec/op_task_invocation.h" +#include "local-execution/op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/noop.h b/lib/local-execution/src/ops/noop.h similarity index 87% rename from lib/runtime/src/ops/noop.h rename to lib/local-execution/src/ops/noop.h index f5cf6cc98c..fab2cf1f86 100644 --- a/lib/runtime/src/ops/noop.h +++ b/lib/local-execution/src/ops/noop.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_NOOP_H #define _FLEXFLOW_NOOP_H +#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/input.h" #include "op-attrs/ops/noop.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/parallel_op.h b/lib/local-execution/src/ops/parallel_op.h similarity index 96% rename from lib/runtime/src/ops/parallel_op.h rename to lib/local-execution/src/ops/parallel_op.h index 6b596a4fb5..e7bd98b8a8 100644 --- a/lib/runtime/src/ops/parallel_op.h +++ b/lib/local-execution/src/ops/parallel_op.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ParallelOpJoinResult { - optional op = nullopt; + std::optional op = std::nullopt; bool join_did_succeed = false; }; diff --git a/lib/runtime/src/ops/partition.cc b/lib/local-execution/src/ops/partition.cc similarity index 61% rename from lib/runtime/src/ops/partition.cc rename to lib/local-execution/src/ops/partition.cc index 2a974e96da..4b09ad026b 100644 --- a/lib/runtime/src/ops/partition.cc +++ b/lib/local-execution/src/ops/partition.cc @@ -13,32 +13,13 @@ * limitations under the License. 
*/ -#include "parallel_ops/partition.h" #include "kernels/partition_kernels.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" +#include "repartition.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Repartition; @@ -59,7 +40,7 @@ OpTaskInvocation forward(RepartitionAttrs const &attrs) { binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(ATTRS, attrs); binding.bind_arg(PER_DEVICE_STATE, - per_device_state()); + per_device_op_state()); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -79,64 +60,39 @@ static DeviceSpecific // Note: use the input data type DeviceSpecific per_device_state = - acc.create_device_specific_state( - init_kernel(handle, input.data_type)); + init_kernel(handle, input.data_type); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - return profiling(forward, 
- profiling, - "[Reparition/Partition] forward_time = %.2lfms\n", - per_device_state, - input, - output); -} - -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); + return profile(forward_kernel, + profiling, + "[Reparition/Partition] forward_time = {:.2lf}ms\n", + per_device_state, + input, + output); } -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward, - profiling, - "[Reparition/Partition] backward_time = %.2lfms\n", - per_device_state, - input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + return profile(backward_kernel, + profiling, + "[Reparition/Partition] backward_time = {:.2lf}ms\n", + per_device_state, + output_grad, + input_grad); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -144,7 +100,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); @@ -165,8 +121,10 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_accessor(REPARTITION_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = 
env.get_accessor(REPARTITION_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = + env.get_fwd_accessor(REPARTITION_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = + env.get_bwd_accessor(REPARTITION_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -185,7 +143,8 @@ void register_task() { init.add_return_value(); - register_task(REPARTITION_INIT_TASK_ID, "Repartition Init", init, init_task); + register_task( + REPARTITION_INIT_TASK_ID, "Repartition Init", init, init_task_impl); } template <> @@ -197,15 +156,19 @@ void register_task() { fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - register_task(REPARTITION_FWD_TASK_ID, "Repartition Fwd", fwd, forward_task); + register_task( + REPARTITION_FWD_TASK_ID, "Repartition Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REPARTITION_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REPARTITION_BWD_TASK_ID, "Repartition Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REPARTITION_FWD_TASK_ID)); + +// register_task(REPARTITION_BWD_TASK_ID, "Repartition Bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc similarity index 57% rename from lib/runtime/src/ops/pool_2d.cc rename to lib/local-execution/src/ops/pool_2d.cc index 577837c960..989f390380 100644 --- a/lib/runtime/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -1,10 +1,10 @@ #include "pool_2d.h" #include "kernels/pool_2d_kernels.h" -#include "legion/legion_utilities.h" + #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/pool_2d.h" #include "utils/exception.decl.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include 
"utils/hash-utils.h" using namespace FlexFlow::Kernels::Pool2D; @@ -23,13 +23,13 @@ OpTaskInvocation init(Pool2DAttrs const &attrs) { return {POOL2D_INIT_TASK_ID, binding}; } -static DeviceSpecific +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); int input_w = input.shape.at(ff_dim_t(0)) + 1; int input_h = input.shape.at(ff_dim_t(1)) + 1; @@ -64,37 +64,27 @@ static DeviceSpecific printf("Warning: changing pool_padding_w to satisfy output_w size\n"); } - DeviceSpecific state = acc.create_device_specific( - init_kernel(handle, - attrs.activation, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - attrs.kernel_h, - attrs.kernel_w, - attrs.stride_h, - attrs.stride_w, - attrs.pool_type); + DeviceSpecific state = init_kernel(handle, + attrs.activation, + input_w, + input_h, + input_c, + input_n, + output_w, + output_h, + output_c, + output_n, + pad_h, + pad_w, + attrs.kernel_h, + attrs.kernel_w, + attrs.stride_h, + attrs.stride_w, + attrs.pool_type); return state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - OpTaskInvocation forward(Pool2DAttrs const &attrs) { OpTaskBinding binding; binding.bind(INPUT, input_tensor(0)); @@ -102,54 +92,47 @@ OpTaskInvocation forward(Pool2DAttrs const &attrs) { binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state()); return {POOL2D_FWD_TASK_ID, binding}; } -OpTaskInvocation backward(Pool2DAttrs const &) { +OpTaskInvocation backward(Pool2DAttrs const &attrs) { 
OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); return {POOL2D_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - Pool2dPerDeviceState state = - acc.get_argument(PER_DEVICE_STATE); + Pool2DPerDeviceState state = + acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, - profilng, - "[Pool2D] forward_time = %.2lfms\n", + profiling, + "[Pool2D] forward_time = {:.2lf}ms\n", state, input.get_float_ptr(), output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - Pool2dPerDeviceState state = - acc.get_argument(PER_DEVICE_STATE); + Pool2DPerDeviceState state = + acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto output_grad = acc.get_tensor(OUTPUT); return profile(backward_kernel, - profilng, - "[Pool2D] backward_time = %.2lfms\n", + profiling, + "[Pool2D] backward_time = {:.2lf}ms\n", state, input.get_float_ptr(), input_grad.get_float_ptr(), @@ -157,20 +140,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void 
backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, Pool2DAttrs const &attrs, - ParallelTensorShape const &input, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; @@ -181,21 +156,21 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, auto init_accessor = env.get_init_accessor(POOL2D_INIT_TASK_ID, init_binding); - DeviceSpecific per_device_state = + DeviceSpecific per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; - fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); - auto fwd_accessor = env.get_accessor(POOL2D_FWD_TASK_ID, fwd_binding); + auto fwd_accessor = env.get_fwd_accessor(POOL2D_FWD_TASK_ID, fwd_binding); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto bwd_accessor = env.get_accessor(POOL2D_BWD_TASK_ID, bwd_binding); + auto bwd_accessor = env.get_bwd_accessor(POOL2D_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -217,7 +192,7 @@ void register_task() { init.add_return_value(); - register_task(POOL2D_INIT_TASK_ID, "Pool2D::init", init, init_taks); + register_task(POOL2D_INIT_TASK_ID, "Pool2D::init", init, init_task_impl); } template <> @@ -228,17 +203,20 @@ void register_task() { fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(PER_DEVICE_STATE); + 
fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - register_task(POOL2D_FWD_TASK_ID, "Pool2D::forward", fwd, forward_task); + register_task(POOL2D_FWD_TASK_ID, "Pool2D::forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(POOL2D_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(POOL2D_BWD_TASK_ID, "Pool2D::backward", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(POOL2D_FWD_TASK_ID)); + +// register_task(POOL2D_BWD_TASK_ID, "Pool2D::backward", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/pool_2d.h b/lib/local-execution/src/ops/pool_2d.h similarity index 95% rename from lib/runtime/src/ops/pool_2d.h rename to lib/local-execution/src/ops/pool_2d.h index f8701f461e..0537e9f1c4 100644 --- a/lib/runtime/src/ops/pool_2d.h +++ b/lib/local-execution/src/ops/pool_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/pool_2d.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(Pool2DAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, Pool2DAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input_shape, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc similarity index 57% rename from lib/runtime/src/ops/reduce.cc rename to lib/local-execution/src/ops/reduce.cc index 2674dc4fef..98d1a6f522 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -1,27 +1,12 @@ #include "reduce.h" #include "kernels/reduce_kernels.h" -#include 
"legion/legion_utilities.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" + +#include "op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" #include "utils/type_traits_core.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reduce; @@ -35,7 +20,7 @@ enum Slots { HANDLE }; -OpTaskInvocation init(TransposeAttrs const &attrs) { +OpTaskInvocation init(ReduceAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(HANDLE, ff_handle()); @@ -54,42 +39,33 @@ static DeviceSpecific auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - OperatorType = attrs.op_type; + OperatorType op_type = attrs.op_type; // Note: How to set the reduction size? 
size_t reduction_size = input.shape.get_volume() / output.shape.get_volume(); DeviceSpecific per_device_state = - acc.create_device_specific(init_kernel( - handle, op_type, reduction_size, input.shape, output.shape)); + init_kernel(handle, op_type, reduction_size, input.shape, output.shape); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT) + OpTaskSignature init(OpTaskType::INIT); - init.add_unchecked_arg_slot(HANDLE); + init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); init.add_return_value(); - register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); + register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task_impl); } // Note: forward_kernel only needs ReducePerDeviceState, input, output OpTaskInvocation forward(ReduceAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - bind.bind_arg(PROFILING, profiling_tensor()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + binding.bind_arg(PROFILING, profiling_settings()); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -97,7 +73,7 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) { return {REDUCE_FWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -107,31 +83,23 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Reduce] forward_time = %.2lfms\n", + "[Reduce] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), 
output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FORWARD); + OpTaskSignature fwd(OpTaskType::FWD); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task); + register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task_impl); } OpTaskInvocation backward(ReduceAttrs const &attrs) { @@ -140,48 +108,44 @@ OpTaskInvocation backward(ReduceAttrs const &attrs) { return {REDUCE_BWD_TASK_ID, binding}; } -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, - "[Reduce] backward_time = %.2lfms\n", + "[Reduce] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} +// TODO: OpTaskSignature -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); +// template <> +// void register_task() { +// OpTaskSignature bwd = +// 
infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); - reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task); -} +// register_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, +// backward_task_impl); +// } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReduceAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); + init_binding.bind_arg(HANDLE, ff_handle()); auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding); DeviceSpecific per_device_state = @@ -189,10 +153,10 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding fwd_binding; ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - fwd.bind(INPUT, input.shape); - fwd.bind(OUTPUT, output_shape); - fwd.bind_arg(PROFILING, settings); - fwd.bind_arg(PER_DEVICE_STATE, per_device_state); + fwd_binding.bind(INPUT, input.shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); diff --git a/lib/runtime/src/ops/reduce.h b/lib/local-execution/src/ops/reduce.h similarity index 96% rename from lib/runtime/src/ops/reduce.h rename to lib/local-execution/src/ops/reduce.h index 099083ed67..6d47ec2f4d 100644 --- a/lib/runtime/src/ops/reduce.h +++ b/lib/local-execution/src/ops/reduce.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reduce.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow 
{ diff --git a/lib/runtime/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc similarity index 59% rename from lib/runtime/src/ops/reduction.cc rename to lib/local-execution/src/ops/reduction.cc index 9a11d3a6f5..3fa300f64d 100644 --- a/lib/runtime/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -13,32 +13,14 @@ * limitations under the License. */ -#include "parallel_ops/reduction.h" +#include "reduction.h" #include "kernels/reduction_kernels.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reduction; @@ -61,7 +43,7 @@ OpTaskInvocation backward(ReductionAttrs const &attrs) { return {REDUCTION_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling_settings = acc.get_argument(PROFILING); @@ -71,40 +53,25 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { size_t num_replicas = attrs.reduction_degree; - return profiling(forward_kernel, - profiling_settings, - "[Reduction] forward_time = %.2lfms\n", - input, - output, - num_replicas); + return profile(forward_kernel, + profiling_settings, + "[Reduction] forward_time = {:.2lf}ms\n", + input, + output, + num_replicas); } -static void forward_task(Task 
const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward_kernel, - profiling, - "[Reduction] backward_time = %.2lfms\n", - input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + return profile(backward_kernel, + profiling, + "[Reduction] backward_time = {:.2lf}ms\n", + input_grad, + output_grad); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -114,13 +81,13 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, MachineView const &machine_view) { ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(ATTRS, attrs); fwd_binding.bind(INPUT, input.shape); - fwd.binding.bind(OUTPUT, output_shape); + fwd_binding.bind(OUTPUT, output_shape); auto fwd_accessor = env.get_fwd_accessor(REDUCTION_FWD_TASK_ID, fwd_binding); @@ -145,15 +112,18 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REDUCTION_FWD_TASK_ID, "Reduction Fwd", fwd, forward_task); + register_task(REDUCTION_FWD_TASK_ID, "Reduction Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - 
infer_bwd_signature(get_op_signature(REDUCTION_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REDUCTION_BWD_TASK_ID, "Reduction Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REDUCTION_FWD_TASK_ID)); + +// register_task(REDUCTION_BWD_TASK_ID, "Reduction Bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reduction.h b/lib/local-execution/src/ops/reduction.h similarity index 94% rename from lib/runtime/src/ops/reduction.h rename to lib/local-execution/src/ops/reduction.h index 978ca6b080..a69b75f310 100644 --- a/lib/runtime/src/ops/reduction.h +++ b/lib/local-execution/src/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "op-attrs/ops/combine.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" +#include "op-attrs/ops/reduction.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/repartition.h b/lib/local-execution/src/ops/repartition.h similarity index 97% rename from lib/runtime/src/ops/repartition.h rename to lib/local-execution/src/ops/repartition.h index fccc0de7be..a73bd3f808 100644 --- a/lib/runtime/src/ops/repartition.h +++ b/lib/local-execution/src/ops/repartition.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/repartition.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc similarity index 66% rename from lib/runtime/src/ops/replicate.cc rename to lib/local-execution/src/ops/replicate.cc index 1675a62c5f..a441985b78 100644 --- 
a/lib/runtime/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -13,39 +13,21 @@ * limitations under the License. */ -#include "parallel_ops/replicate.h" +#include "replicate.h" #include "kernels/replicate_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/graph/serialparallel.h" #include "utils/hash-utils.h" #include namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Replicate; -enum Slots { INPUT, OUTPUT, PROFILING }; +enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; OpTaskInvocation forward(ReplicateAttrs const &attrs) { OpTaskBinding binding; @@ -54,6 +36,7 @@ OpTaskInvocation forward(ReplicateAttrs const &attrs) { binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); return {REPLICATE_FWD_TASK_ID, binding}; } @@ -63,7 +46,7 @@ OpTaskInvocation backward(ReplicateAttrs const &attrs) { return {REPLICATE_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -71,38 +54,25 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[replicate] forward_time = %.2lfms\n", + "[replicate] forward_time = 
{:.2lf}ms\n", input, output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto const &attrs = acc.get_argument(ATTRS); return profile(backward_kernel, profiling, - "[replicate] backward_time = %.2lfms\n", + "[replicate] backward_time = {:.2lf}ms\n", input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + output_grad, + attrs.replicate_degree); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -110,7 +80,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PROFILING, settings); ParallelTensorShape output = get_output_shape(attrs, input.shape); @@ -136,14 +106,18 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REPLICATE_FWD_TASK_ID, "Replicate fwd", fwd, forward_task); + register_task(REPLICATE_FWD_TASK_ID, "Replicate fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(CAST_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REPLICATE_BWD_TASK_ID, "Replicate bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// 
OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REPLICATE_FWD_TASK_ID)); + +// register_task(REPLICATE_BWD_TASK_ID, "Replicate bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/replicate.h b/lib/local-execution/src/ops/replicate.h similarity index 89% rename from lib/runtime/src/ops/replicate.h rename to lib/local-execution/src/ops/replicate.h index da2b71f098..339f805f2c 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/local-execution/src/ops/replicate.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/replicate.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc similarity index 68% rename from lib/runtime/src/ops/reshape.cc rename to lib/local-execution/src/ops/reshape.cc index c9dc8cff8d..efee73645b 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -15,24 +15,10 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reshape; @@ -69,74 +55,50 @@ static DeviceSpecific auto attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(attrs.shape.data_type)); + init_kernel(attrs.shape.data_type); return 
per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - Profiling profiling = acc.get_argument(PROFILING); + ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, profiling, - "[Reshape] forward time = %.2lfms\n", + "[Reshape] forward time = {:.2lf}ms\n", per_device_state, input, output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - Profiling profiling = acc.get_argument(PROFILING); + ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, - "[Reshape] backward time = %.2lfms\n", + "[Reshape] backward time = {:.2lf}ms\n", per_device_state, input_grad, output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReshapeAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { + auto env = 
sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); auto init_accessor = @@ -168,9 +130,9 @@ void register_task() { init.add_arg_slot(ATTRS); - init.add_return_value(PER_DEVICE_STATE); + init.add_return_value(); - register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); + register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task_impl); } template <> @@ -183,15 +145,17 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task); + register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); + +// register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reshape.h b/lib/local-execution/src/ops/reshape.h similarity index 97% rename from lib/runtime/src/ops/reshape.h rename to lib/local-execution/src/ops/reshape.h index f044e3f057..14b22561a0 100644 --- a/lib/runtime/src/ops/reshape.h +++ b/lib/local-execution/src/ops/reshape.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reshape.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc similarity index 67% rename from lib/runtime/src/ops/reverse.cc rename to lib/local-execution/src/ops/reverse.cc index ac64146cd1..7fefb3d357 100644 --- 
a/lib/runtime/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -19,23 +19,9 @@ #include "op-attrs/get_output_shapes.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reverse; +using coord_t = long long; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -43,7 +29,7 @@ OpTaskInvocation forward(ReverseAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -56,28 +42,28 @@ OpTaskInvocation backward(ReverseAttrs const &attrs) { return {REVERSE_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int output_size = outtput.shape.get_volume(); + int output_size = output.shape.get_volume(); auto axis = attrs.axis; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < output.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= output.shape[i]; + in_blk_size *= output.shape.at(ff_dim_t(i)); } else if (i == axis) { - reverse_dim_size = output.shape[i]; + reverse_dim_size = output.shape.at(ff_dim_t(i)); } else { - num_out_blks *= output.shape[i]; + num_out_blks *= output.shape.at(ff_dim_t(i)); } } return profile(forward_kernel, profiling, - "[reverse] 
forward_time = %.2lfms\n", + "[reverse] forward_time = {:.2lf}ms\n", input.get_float_ptr(), output.get_float_ptr(), num_out_blks, @@ -86,49 +72,34 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output_size); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input.shape.get_dim() - attrs.axis - 1; + int axis = input_grad.shape.get_dim() - attrs.axis.value() - 1; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < input_grad.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= input_grad.shape[i]; + in_blk_size *= input_grad.shape.at(ff_dim_t(i)); } else if (i == axis) { - reverse_dim_size = input_grad.shape[i]; + reverse_dim_size = input_grad.shape.at(ff_dim_t(i)); } else { - num_out_blks *= input_grad.shape[i]; + num_out_blks *= input_grad.shape.at(ff_dim_t(i)); } } return profile(backward_kernel, profiling, - "[reverse] backward_time = %.2lfms\n", + "[reverse] backward_time = {:.2lf}ms\n", output_grad.get_float_ptr(), input_grad.get_float_ptr(), num_out_blks, reverse_dim_size, in_blk_size, - input.shape.get_volume()); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + input_grad.shape.get_volume()); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -136,7 +107,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, 
InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; @@ -161,21 +132,23 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, } template <> -void register_task()) { +void register_task() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REVERSE_FWD_TASK_ID, "Reverse forward", fwd, forward_task); + register_task(REVERSE_FWD_TASK_ID, "Reverse forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REVERSE_BWD_TASK_ID)); - register_task(REVERSE_BWD_TASK_ID, "Reverse backward", bwd, backward_task); -} +// TODO: OpTaskSignature +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REVERSE_BWD_TASK_ID)); +// register_task(REVERSE_BWD_TASK_ID, "Reverse backward", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reverse.h b/lib/local-execution/src/ops/reverse.h similarity index 89% rename from lib/runtime/src/ops/reverse.h rename to lib/local-execution/src/ops/reverse.h index af4d335429..5be501698c 100644 --- a/lib/runtime/src/ops/reverse.h +++ b/lib/local-execution/src/ops/reverse.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reverse.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc similarity index 68% rename from lib/runtime/src/ops/softmax.cc rename to lib/local-execution/src/ops/softmax.cc index b67f9730a4..ea857c680b 100644 --- 
a/lib/runtime/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -17,26 +17,10 @@ #include "kernels/softmax_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Softmax; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE }; @@ -75,21 +59,11 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, attrs.dim)); + init_kernel(handle, attrs.dim.value()); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -98,21 +72,14 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[SoftMax] forward_time = %.2lfms\n", + "[SoftMax] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), - output.get_float_ptr(), ); + output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime 
*runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); @@ -124,22 +91,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { assert(output_grad.shape == output.shape); - return profile( - backward_kernel, - profiling, - "[SoftMax] backward_time = %.2lfms\n", - input_grad.get_float_ptr(), - output_grad.get_float_ptr(), - output_grad.shape.volume(), // Note(lambda): get num_elements, maybe wrong - ); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + return profile(backward_kernel, + profiling, + "[SoftMax] backward_time = {:.2lf}ms\n", + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.shape.get_volume()); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -147,7 +104,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); @@ -162,7 +119,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, init_task_impl(init_accessor); SimTaskBinding fwd_binding; - ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); fwd_binding.bind_arg(PROFILING, settings); @@ -186,9 +142,9 @@ void register_task() { init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); - 
init.add_return_value_slot(); + init.add_return_value(); - register_task(SOFTMAX_INIT_TASK_ID, "SoftMax Init", init, init_task); + register_task(SOFTMAX_INIT_TASK_ID, "SoftMax Init", init, init_task_impl); } template <> @@ -201,15 +157,17 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(SOFTMAX_FWD_TASK_ID, "SoftMax Fwd", fwd, forward_task); + register_task(SOFTMAX_FWD_TASK_ID, "SoftMax Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(SOFTMAX_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(SOFTMAX_BWD_TASK_ID, "SoftMax Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(SOFTMAX_FWD_TASK_ID)); + +// register_task(SOFTMAX_BWD_TASK_ID, "SoftMax Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/softmax.h b/lib/local-execution/src/ops/softmax.h similarity index 97% rename from lib/runtime/src/ops/softmax.h rename to lib/local-execution/src/ops/softmax.h index 06b9d09d60..a83d8f4116 100644 --- a/lib/runtime/src/ops/softmax.h +++ b/lib/local-execution/src/ops/softmax.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SOFTMAX_H #define _FLEXFLOW_SOFTMAX_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/softmax.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/split.cc b/lib/local-execution/src/ops/split.cc similarity index 58% rename from lib/runtime/src/ops/split.cc rename to lib/local-execution/src/ops/split.cc index 2af5d42874..13e95d37f9 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -16,28 +16,14 @@ #include "split.h" #include "kernels/array_shape.h" #include "kernels/split_kernels.h" -#include "utils/exceptions.h" +#include 
"op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; using namespace FlexFlow::Kernels::Split; +using coord_t = long long; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -58,96 +44,86 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { return {SPLIT_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +void calc_block_size(coord_t &num_blocks, + coord_t &block_size, + ArrayShape const &array_shape, + int axis) { + num_blocks = 1; + block_size = 1; + for (int d = 0; d < array_shape.num_elements(); d++) { + if (d <= axis) { + block_size *= array_shape.at(legion_dim_t(d)); + } else { + num_blocks *= array_shape.at(legion_dim_t(d)); + } + } +} + +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis); + coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; + calc_block_size(num_blocks, in_block_size, input.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blks; + coord_t out_num_blocks; calc_block_size( - out_num_blks, out_blk_size[i], output.shape, split->legion_axis); + out_num_blocks, out_block_size[i], output.shape, attrs.axis.value()); } + float *output_float_ptr = 
output.get_float_ptr(); return profile(forward_kernel, profiling, - "Split forward_time = %.2lfms\n", - &output.get_float_ptr(), + "Split forward_time = {:.2lf}ms\n", + &output_float_ptr, input.get_float_ptr(), - out_blk_size, - in_blk_size, - num_blks, + out_block_size, + in_block_size, + num_blocks, attrs.splits.size()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - // maybe we should add assert like the original code -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input_grade.shape, attrs.axis); + coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; + calc_block_size( + num_blocks, in_block_size, input_grad.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blks; - calc_block_size( - out_num_blks, out_blk_size[i], output_grad.shape, split->legion_axis); + coord_t out_num_blocks; + calc_block_size(out_num_blocks, + out_block_size[i], + output_grad.shape, + attrs.axis.value()); } + float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, profiling, - "Split backward_time = %.2lfms\n", + "Split backward_time = {:.2lf}ms\n", input_grad.get_float_ptr(), - &output_grad.get_float_ptr(), - out_blk_size, - in_blk_size, - num_blks, + &output_grad_ptr, + out_block_size, + in_block_size, + num_blocks, attrs.splits.size()); } -void calc_block_size(coord_t &num_blks, - coord_t &blk_size, - ArrayShape const &array_shape, - int axis) { - num_blks = 
1; - blk_size = 1; - for (int d = 0; d < array_shape.get_dim(); d++) { - if (d <= axis) { - blk_size *= (domain.hi()[d] - domain.lo()[d] + 1); - blk_size *= array_shape.at(legion_dim_t(d)) + 1 - } else { - num_blks *= array_shape.at(legion_dim_t(d)) + 1 - } - } -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + std::vector output_shape = + get_output_shapes(attrs, input.shape); SimTaskBinding fwd_binding; fwd_binding.bind(INPUT, input.shape); @@ -166,6 +142,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, return make_metrics(forward_time, backward_time, sync_time, env); } +// TODO: OpTaskSignature + template <> void register_task() { OpTaskSignature fwd(OpTaskType::FWD); @@ -174,15 +152,15 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task); + register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); - register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task); -} +// register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/split.h b/lib/local-execution/src/ops/split.h similarity index 93% rename 
from lib/runtime/src/ops/split.h rename to lib/local-execution/src/ops/split.h index d63212e836..f51e0ea6af 100644 --- a/lib/runtime/src/ops/split.h +++ b/lib/local-execution/src/ops/split.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/split.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(SplitAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, - InputParallelTensorDes const &input, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc similarity index 60% rename from lib/runtime/src/ops/topk.cc rename to lib/local-execution/src/ops/topk.cc index 958516a6d9..8aceb9c6d4 100644 --- a/lib/runtime/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -16,28 +16,9 @@ #include "topk.h" #include "kernels/topk_kernels.h" #include "op-attrs/get_output_shapes.h" -#include "utils/exceptions.h" +#include "utils/exception.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; using namespace FlexFlow::Kernels::TopK; @@ -50,7 +31,7 @@ enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, PER_DEVICE_STATE }; OpTaskInvocation init(TopKAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(ATTRS, 
attrs); + binding.bind_arg(ATTRS, attrs); return {TOPK_INIT_TASK_ID, binding}; } @@ -60,7 +41,7 @@ OpTaskInvocation forward(TopKAttrs const &attrs) { binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -81,23 +62,14 @@ static DeviceSpecific auto attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific(init_kernel(attrs.sorted)); + init_kernel(attrs.sorted); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto per_device_state = - acc.get_device_specific(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); auto profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -107,31 +79,24 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { size_t batch_size = input.shape.get_volume() / length; auto indices = acc.get_tensor(INDICES); - return profiling(forward_kernel, - profiling, - "[TopK] forward_time = %.2lfms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - indices.get_int32_ptr(), - batch_size, - length, - attrs.k, - attrs.sorted); + return profile(forward_kernel, + profiling, + "[TopK] forward_time = {:.2lf}ms\n", + per_device_state, + input.get_float_ptr(), + output.get_float_ptr(), + indices.get_int32_ptr(), + batch_size, + length, + attrs.k, + attrs.sorted); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor 
acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto per_device_state = - acc.get_device_specific(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); auto profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); @@ -139,27 +104,19 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto indices = acc.get_tensor(INDICES); - int length = input.shape.at(legion_dim_t(0)) + 1; - size_t batch_size = input.shape.get_volume() / length; - - return profiling(backward_kernel, - profiling, - "[TopK] backward_time = %.2lfms\n", - per_device_state, - output_grad.get_float_ptr(), - indices.get_int32_ptr(), - input_grad.get_float_ptr(), - batch_size, - length, - attrs.k); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + int length = input_grad.shape.at(legion_dim_t(0)) + 1; + size_t batch_size = input_grad.shape.get_volume() / length; + + return profile(backward_kernel, + profiling, + "[TopK] backward_time = {:.2lf}ms\n", + per_device_state, + output_grad.get_float_ptr(), + indices.get_int32_ptr(), + input_grad.get_float_ptr(), + batch_size, + length, + attrs.k); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -167,9 +124,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shapes(attrs, input.shape); + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding 
init_binding; init_binding.bind_arg(ATTRS, attrs); @@ -204,7 +161,7 @@ void register_task() { init.add_arg_slot(ATTRS); // Note: this may have some question init.add_return_value(); - register_task(TOPK_INIT_TASK_ID, "Topk Init", init, init_task); + register_task(TOPK_INIT_TASK_ID, "Topk Init", init, init_task_impl); } template <> @@ -219,14 +176,17 @@ void register_task() { fwd.add_output_slot(OUTPUT); fwd.add_output_slot(INDICES); - register_task(TOPK_FWD_TASK_ID, "TopK Forward", fwd, forward_task); + register_task(TOPK_FWD_TASK_ID, "TopK Forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(TOPK_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(TOPK_BWD_TASK_ID, "TopK Backward", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(TOPK_FWD_TASK_ID)); + +// register_task(TOPK_BWD_TASK_ID, "TopK Backward", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/topk.h b/lib/local-execution/src/ops/topk.h similarity index 97% rename from lib/runtime/src/ops/topk.h rename to lib/local-execution/src/ops/topk.h index f15ff6de81..db85fd9d03 100644 --- a/lib/runtime/src/ops/topk.h +++ b/lib/local-execution/src/ops/topk.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/topk.h" -#include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc similarity index 55% rename from lib/runtime/src/ops/transpose.cc rename to lib/local-execution/src/ops/transpose.cc index ea6182772f..c998484455 100644 --- a/lib/runtime/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -15,27 +15,10 @@ #include "transpose.h" 
#include "kernels/transpose_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/transpose.h" #include "utils/exception.decl.h" -namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Transpose; namespace FlexFlow { @@ -57,33 +40,26 @@ OpTaskInvocation init(TransposeAttrs const &attrs) { static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); - std::vector perm = attrs.perm; // default convert stack_vector to vector + std::vector perm = static_cast>(attrs.perm); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(perm.size(), perm)); + init_kernel(perm.size(), perm); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} +// TODO: OpTaskSignature -template <> -void register_task(); -OpTaskSignature init(OpTaskType::INIT) +// template <> +// void register_task() { +// OpTaskSignature init(OpTaskType::INIT); - init.add_arg_slot(ATTRS); +// init.add_arg_slot(ATTRS); -init.add_return_value(); +// init.add_return_value(); -register_task(TRANSPOSE_INIT_TASK_ID, "Transpose::init", init, init_task); -} // namespace FlexFlow +// register_task(TRANSPOSE_INIT_TASK_ID, "Transpose::init", init, +// init_task_impl); +// } OpTaskInvocation forward(TransposeAttrs const &attrs) { OpTaskBinding binding; @@ -92,13 +68,13 @@ OpTaskInvocation forward(TransposeAttrs const 
&attrs) { per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - bind.bind(INPUT, input_tensor(0)); - bind.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {TRANSPOSE_FWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -106,47 +82,32 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - return profiling(forward_kernel, - profiling, - "[Transpose] Forward_time = %.2lf [ms]", - per_device_state, - input, - output); + return profile(forward_kernel, + profiling, + "[Transpose] Forward_time = {:.2lf} [ms]", + per_device_state, + input, + output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = - acc.get_per_device_state(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward_kernel, - profiling, - "[Transpose] Backward_time = %.2lf [ms]", - per_device_state, - input_grad, - output_grad); + return profile(backward_kernel, + profiling, + "[Transpose] Backward_time = {:.2lf} [ms]", + per_device_state, + input_grad, + output_grad); } -static void 
backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - -OpTaskInvocation backward(TransposeAttrs const &) { +OpTaskInvocation backward(TransposeAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); return {TRANSPOSE_BWD_TASK_ID, binding}; @@ -159,7 +120,7 @@ CostMetrics &input_descs, // Note:this may have some problem ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); @@ -169,12 +130,13 @@ CostMetrics DeviceSpecific per_device_state = init_task_impl(init_accessor); - ParallelTensorShape output_shape = get_output_shape(attrs, input_descs.shape); + ParallelTensorShape output_shape = + get_output_shape(attrs, input_descs.shapes); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); fwd_binding.bind_arg(PROFILING, settings); - fwd_binding.bind(INPUT, input_descs.shape); + fwd_binding.bind(INPUT, input_descs.shapes); fwd_binding.bind(OUTPUT, output_shape); auto fwd_accessor = env.get_fwd_accessor(TRANSPOSE_FWD_TASK_ID, fwd_binding); @@ -189,4 +151,4 @@ CostMetrics return make_metrics(forward_time, backward_time, sync_time, env); } -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/lib/runtime/src/ops/transpose.h b/lib/local-execution/src/ops/transpose.h similarity index 97% rename from lib/runtime/src/ops/transpose.h rename to lib/local-execution/src/ops/transpose.h index 52e824ebbf..daa64e8e59 100644 --- a/lib/runtime/src/ops/transpose.h +++ b/lib/local-execution/src/ops/transpose.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/transpose.h" -#include 
"sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/permissions.cc b/lib/local-execution/src/permissions.cc similarity index 66% rename from lib/runtime/src/permissions.cc rename to lib/local-execution/src/permissions.cc index 2992780ae1..e5c46b42f8 100644 --- a/lib/runtime/src/permissions.cc +++ b/lib/local-execution/src/permissions.cc @@ -1,38 +1,8 @@ -#include "permissions.h" +#include "local-execution/permissions.h" #include "utils/exception.h" namespace FlexFlow { -Legion::PrivilegeMode to_legion(Permissions p) { - switch (p) { - case Permissions::NONE: - return LEGION_NO_ACCESS; - case Permissions::RO: - return LEGION_READ_ONLY; - case Permissions::WO: - return LEGION_WRITE_ONLY; - case Permissions::RW: - return LEGION_READ_WRITE; - default: - throw mk_runtime_error("Unknown permission {}", static_cast(p)); - } -} - -optional from_legion(Legion::PrivilegeMode p) { - switch (p) { - case LEGION_NO_ACCESS: - return Permissions::NONE; - case LEGION_READ_ONLY: - return Permissions::RO; - case LEGION_WRITE_ONLY: - return Permissions::WO; - case LEGION_READ_WRITE: - return Permissions::RW; - default: - return nullopt; - } -} - Permissions join(Permissions lhs, Permissions rhs) { if (lhs <= rhs) { return rhs; diff --git a/lib/runtime/src/task_spec/runtime_arg_ref.cc b/lib/local-execution/src/runtime_arg_ref.cc similarity index 55% rename from lib/runtime/src/task_spec/runtime_arg_ref.cc rename to lib/local-execution/src/runtime_arg_ref.cc index a0aa242ce6..df4f024f1d 100644 --- a/lib/runtime/src/task_spec/runtime_arg_ref.cc +++ b/lib/local-execution/src/runtime_arg_ref.cc @@ -1,5 +1,5 @@ -#include "runtime_arg_ref.h" -#include "device_specific.h" +#include "local-execution/runtime_arg_ref.h" +#include "local-execution/device_specific.h" namespace FlexFlow { @@ -11,4 +11,8 @@ RuntimeArgRef> ff_handle() { return {RuntimeArgRefType::FF_HANDLE}; } +RuntimeArgRef> iteration_config() { + return 
{RuntimeArgRefType::FF_ITERATION_CONFIG}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 6d06714252..68636906c3 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -1,4 +1,4 @@ -#include "tracked_allocator.h" +#include "local-execution/tracked_allocator.h" #include "kernels/device.h" namespace FlexFlow { diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/local-execution/src/variadic_tensor_ref.cc new file mode 100644 index 0000000000..efd43a6648 --- /dev/null +++ b/lib/local-execution/src/variadic_tensor_ref.cc @@ -0,0 +1,9 @@ +#include "local-execution/variadic_tensor_ref.h" + +namespace FlexFlow { + +VariadicTensorRef get_input_tensors() { + return {VariadicTensorRefType::INPUT_TENSORS}; +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/get_output_shapes.h b/lib/op-attrs/include/op-attrs/get_output_shapes.h index 6fb93aac91..496cfbb755 100644 --- a/lib/op-attrs/include/op-attrs/get_output_shapes.h +++ b/lib/op-attrs/include/op-attrs/get_output_shapes.h @@ -128,9 +128,7 @@ ParallelTensorShape get_output_shape(DropoutAttrs const &, ParallelTensorShape get_output_shape(ElementBinaryAttrs const &, ParallelTensorShape const &, ParallelTensorShape const &); -ParallelTensorShape get_output_shape(ElementUnaryAttrs const &, - ParallelTensorShape const &); -ParallelTensorShape get_output_shape(ElementScalarUnaryAttrs const &, +ParallelTensorShape get_output_shape(ElementUnaryUnifiedAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(EmbeddingAttrs const &, ParallelTensorShape const &); @@ -153,6 +151,8 @@ ParallelTensorShape get_output_shape(RepartitionAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(ReplicateAttrs const &, ParallelTensorShape const &); +ParallelTensorShape get_output_shape(ReshapeAttrs const &, + 
ParallelTensorShape const &); ParallelTensorShape get_output_shape(ReverseAttrs const &, ParallelTensorShape const &); std::vector get_output_shapes(SplitAttrs const &, diff --git a/lib/op-attrs/include/op-attrs/ops/element_unary.h b/lib/op-attrs/include/op-attrs/ops/element_unary.h index 5e19b81c8c..6a80094dfa 100644 --- a/lib/op-attrs/include/op-attrs/ops/element_unary.h +++ b/lib/op-attrs/include/op-attrs/ops/element_unary.h @@ -21,6 +21,9 @@ struct ElementScalarUnaryAttrs { FF_VISITABLE_STRUCT(ElementScalarUnaryAttrs, op_type, scalar); CHECK_VALID_OP_ATTR(ElementScalarUnaryAttrs); +using ElementUnaryUnifiedAttrs = + std::variant; + } // namespace FlexFlow #endif diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h index a46df59282..3b57a959b8 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear.h +++ b/lib/op-attrs/include/op-attrs/ops/linear.h @@ -36,7 +36,11 @@ CHECK_VALID_OP_ATTR(LinearAttrs); TensorShape get_weights_shape(LinearAttrs const &attrs, TensorShape const &input); +ParallelTensorShape get_weights_shape(LinearAttrs const &attrs, + ParallelTensorShape const &input); TensorShape get_bias_shape(LinearAttrs const &attrs, TensorShape const &input); +ParallelTensorShape get_bias_shape(LinearAttrs const &attrs, + ParallelTensorShape const &input); } // namespace FlexFlow diff --git a/lib/runtime/include/runtime/task_spec/concrete_arg.h b/lib/runtime/include/runtime/task_spec/concrete_arg.h deleted file mode 100644 index 1d973eb81a..0000000000 --- a/lib/runtime/include/runtime/task_spec/concrete_arg.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_INCLUDE_RUNTIME_TASK_SPEC_CONCRETE_ARG_H -#define _FLEXFLOW_RUNTIME_INCLUDE_RUNTIME_TASK_SPEC_CONCRETE_ARG_H - -#include "arg_type_runtime_tag.h" -#include "utils/type_index.h" -#include - -namespace FlexFlow { - -struct ConcreteArgSpec { -public: - ConcreteArgSpec() = delete; - - template - T const &get() { - assert(this->type_tag.matches()); - - return 
*(T const *)ptr.get(); - } - - ArgTypeRuntimeTag get_type_tag() const { - return this->type_tag; - } - size_t serialize(Legion::Serializer &) const; - - template - static ConcreteArgSpec create(T const &t) { - static_assert(is_serializable::value, "Type must be serializable"); - - return ConcreteArgSpec(type_index(), - std::make_shared(t), - ArgTypeRuntimeTag::create()); - } - -private: - ConcreteArgSpec(std::type_index, - std::shared_ptr, - ArgTypeRuntimeTag const &); - - ArgTypeRuntimeTag type_tag; - std::shared_ptr ptr; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/ops/gather.cc b/lib/runtime/src/ops/gather.cc deleted file mode 100644 index 9ef53ffc6a..0000000000 --- a/lib/runtime/src/ops/gather.cc +++ /dev/null @@ -1,416 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gather.h" -#include "embedding.h" -#include "kernels/gather_kernels.h" -#include "legion/legion_utilities.h" - -namespace FlexFlow { - -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; - -using namespace FlexFlow::Kernels::Gather; - -GatherParams Gather::get_params() const { - GatherParams params; - params.legion_dim = this->legion_dim; - params.layer_guid = this->layer_guid; - return params; -} - -Tensor FFModel::gather(const Tensor input, - const Tensor index, - int dim, - char const *name) { - Layer *gather = new Layer(this, - OP_GATHER, - DT_FLOAT, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*output*/, - input, - index); - assert(index->data_type == DT_INT32 || index->data_type == DT_INT64); - assert(input->num_dims == index->num_dims); - int legion_dim = input->num_dims - 1 - dim; - // https://pytorch.org/docs/stable/generated/torch.gather.html - // Currently we assume index.size(d) == input.size(d) for all - // dimensions d != dim, which is a stronger constraint that PyTorch's - for (int i = 0; i < input->num_dims; i++) { - if (i != legion_dim) { - assert(input->dims[i] == index->dims[i]); - } - } - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < index->num_dims; i++) { - dims[i] = index->dims[i]; - } - gather->outputs[0] = create_tensor_legion_ordering( - index->num_dims, dims, input->data_type, gather, 0, true /*create_grad*/); - gather->add_int_property("legion_dim", legion_dim); - layers.push_back(gather); - return gather->outputs[0]; -} - -Op *Gather::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - 
layer->get_int_property("legion_dim", value); - int legion_dim = value; - return new Gather( - model, layer->layer_guid, inputs[0], inputs[1], legion_dim, layer->name); -} - -Gather::Gather(FFModel &model, - GatherParams const ¶ms, - std::pair const &inputs, - char const *name) - : Gather(model, - params.layer_guid, - inputs.first, - inputs.second, - params.legion_dim, - name) {} - -Gather::Gather(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor input, - const ParallelTensor index, - int _legion_dim, - char const *name) - : Op(model, - OP_GATHER, - input->data_type, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - index), - legion_dim(_legion_dim) { - layer_guid = _layer_guid; - // Assume that input and index have the same paralleldim except - // for the legion_dim-th dim, which cannot be parallelized - for (int i = 0; i < input->num_dims; i++) { - if (i != legion_dim) { - assert(input->dims[i] == index->dims[i]); - } - } - assert(index->dims[legion_dim].degree == 1); - assert(input->dims[legion_dim].degree == 1); - // output has the same parallel dims as index - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < index->num_dims; i++) { - dims[i] = index->dims[i]; - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - index->num_dims, dims, input->data_type, this); -} - -void Gather::serialize(Legion::Serializer &sez) const { - GatherParams params = get_params(); - sez.serialize(params.legion_dim); - sez.serialize(this->layer_guid.id); -} - -using PCG::Node; -/*static*/ -Node Gather::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 2); - int legion_dim; - dez.deserialize(legion_dim); - size_t id; - dez.deserialize(id); - LayerID layer_guid(id); - - GatherParams params; - params.legion_dim = legion_dim; - params.layer_guid = layer_guid; - return ff.get_or_create_node({inputs[0], inputs[1]}, params); -} - -Op *Gather::materialize(FFModel 
&ff, - ParallelTensor inputs[], - int num_inputs) const { - GatherParams params = get_params(); - return new Gather(ff, params, {inputs[0], inputs[1]}, this->name); -} - -void Gather::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(GATHER_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Gather)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -PerDeviceOpState *Gather::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - Gather const *gather = (Gather const *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - GatherMeta *m = new GatherMeta(handle, gather); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = 
helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - assert(input.domain.get_dim() == index.domain.get_dim()); - assert(output.domain.get_dim() == index.domain.get_dim()); - for (int i = 0; i < input.domain.get_dim(); i++) { - assert(index.domain.hi()[i] == output.domain.hi()[i]); - assert(index.domain.lo()[i] == output.domain.lo()[i]); - if (i != m->legion_dim) { - assert(input.domain.hi()[i] == index.domain.hi()[i]); - assert(input.domain.lo()[i] == index.domain.lo()[i]); - } - } - return m; -} - -void Gather::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(GATHER_FWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, false), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} - -void Gather::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - GatherMeta const *m = *((GatherMeta **)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], 
regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, index, output); -} - -void Gather::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); - IndexLauncher launcher(GATHER_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} - -void Gather::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - GatherMeta const *m = *((GatherMeta **)task->local_args); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - backward_kernel_wrapper(m, output_grad, index, input_grad); -} - -bool 
Gather::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_input, sub_index, sub_output; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - if (!inputs[1]->get_sub_tensor(mv, sub_index)) { - return false; - } - GatherMeta *m = new GatherMeta(sim->handler, this); - sim->free_all(); - bool out_of_memory = false; - Domain input_domain = sub_input.get_domain(); - void *input_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW input_acc( - inputs[0]->data_type, input_domain, input_ptr); - Domain index_domain = sub_index.get_domain(); - void *index_ptr = sim->allocate(sub_index.get_volume(), inputs[1]->data_type); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW index_acc( - inputs[1]->data_type, index_domain, index_ptr); - out_of_memory = out_of_memory || (input_ptr == NULL) || (index_ptr == NULL); - Domain out_domain = sub_output.get_domain(); - void *output_ptr = - sim->allocate(sub_output.get_volume(), outputs[0]->data_type); - out_of_memory = out_of_memory || (output_ptr == NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW output_acc( - outputs[0]->data_type, out_domain, output_ptr); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(m, input_acc, index_acc, output_acc); - }; - if (sim->computationMode == COMP_MODE_TRAINING) { - backward = [&] { - backward_kernel_wrapper(m, output_acc, index_acc, input_acc); - }; - } - - inner_measure_operator_cost(sim, forward, backward, 
cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - printf("[Measure Gather] name(%s) forward_time(%.4lf) " - "backward_time(%.4lf)\n", - name, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Gather] name(%s) forward_time(%.4lf)\n", - name, - cost_metrics.forward_time); - } - delete m; - return true; -} - -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::GatherParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.legion_dim); - hash_combine(key, params.layer_guid.id); - return key; -} -}; // namespace std diff --git a/lib/runtime/src/ops/gather.h b/lib/runtime/src/ops/gather.h deleted file mode 100644 index 1ea20b71f5..0000000000 --- a/lib/runtime/src/ops/gather.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef _FLEXFLOW_OPS_GATHER_H -#define _FLEXFLOW_OPS_GATHER_H - -#include "op-attrs/ops/gather.h" -#include "op_task_invocation.h" -#include "sim_environment.h" - -namespace FlexFlow { - -template <> -void register_task(); -template <> -void register_task(); -template <> -void register_task(); - -OpTaskInvocation init(GatherAttrs const &); -OpTaskInvocation forward(GatherAttrs const &); -OpTaskInvocation backward(GatherAttrs const &); - -CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, - GatherAttrs const &attrs, - ParallelTensorShape const &input_shape, - ParallelTensorShape const &index_shape, - ProfilingSettings const &settings, - MachineView const &machine_view); - -/* class Gather : public Op { */ -/* public: */ -/* Gather(FFModel &model, */ -/* ParallelTensor const &input, */ -/* ParallelTensor const &index, */ -/* int legion_dim, */ -/* char const *name = nullptr); */ -/* void init(FFModel const &) override; */ -/* void forward(FFModel const &) override; */ -/* void backward(FFModel const &) override; */ - -/* static Op * */ -/* create_operator_from_layer(FFModel &model, */ -/* Layer const *layer, */ -/* std::vector const &inputs); - */ - 
-/* static PerDeviceOpState *init_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void forward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void backward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* bool measure_operator_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const override; */ -/* void serialize(Legion::Serializer &s) const override; */ -/* /1* static PCG::Node deserialize(FFModel &ff, *1/ */ -/* /1* Legion::Deserializer &d, *1/ */ -/* /1* ParallelTensor inputs[], *1/ */ -/* /1* int num_inputs); *1/ */ -/* Op *materialize(FFModel &ff, */ -/* ParallelTensor inputs[], */ -/* int num_inputs) const override; */ - -/* public: */ -/* int legion_dim; */ -/* }; */ - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/task_spec/op_task_invocation.cc b/lib/runtime/src/task_spec/op_task_invocation.cc deleted file mode 100644 index fbbfe47726..0000000000 --- a/lib/runtime/src/task_spec/op_task_invocation.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "op_task_invocation.h" -#include "task_argument_accessor.h" - -namespace FlexFlow { - -OpTaskSignature get_signature(task_id_t const &) { - NOT_IMPLEMENTED(); -} - -OpTensorSpec::OpTensorSpec(TensorRole _role, int _idx) - : role(_role), idx(_idx) {} - -OpTensorSpec input_tensor(int idx) { - return {TensorRole::INPUT, idx}; -} - -OpTensorSpec output_tensor(int idx) { - return {TensorRole::OUTPUT, idx}; -} - -OpTensorSpec weight_tensor(int idx) { - return {TensorRole::WEIGHT, idx}; -} - -// OpTaskBinding::OpTaskBinding() { -// this->serializer.reserve_bytes(sizeof(TaskArgumentFormat)); -// } - -void OpTaskBinding::bind(slot_id slot, OpTensorSpec const &tensor_spec) { - 
this->tensor_bindings.insert({{slot, IsGrad::NO}, tensor_spec}); -} - -void OpTaskBinding::bind_grad(slot_id slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert({{slot, IsGrad::YES}, tensor_spec}); -} - -std::unordered_map, OpTensorSpec> const & - OpTaskBinding::get_tensor_bindings() const { - return this->tensor_bindings; -} - -std::unordered_map const & - OpTaskBinding::get_arg_bindings() const { - return this->arg_bindings; -} - -} // namespace FlexFlow diff --git a/lib/runtime/src/task_spec/op_task_invocation.h b/lib/runtime/src/task_spec/op_task_invocation.h deleted file mode 100644 index 56e709734e..0000000000 --- a/lib/runtime/src/task_spec/op_task_invocation.h +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H -#define _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H - -#include "accessor.h" -#include "index_task_invocation.h" -#include "legion.h" -#include "op_arg_ref.h" -#include "op_task_signature.h" -#include "op_tensor_spec.h" -#include "runtime/config.h" -#include "runtime/profiling.h" -#include "serialization.h" -#include "standard_task_invocation.h" -#include "tasks.h" -#include "utils/bidict.h" -#include "utils/optional.h" -#include "utils/stack_map.h" -#include "variadic_tensor_ref.h" -#include -#include -#include - -namespace FlexFlow { - -enum class IsTrainable { YES, NO }; - -using OpArgSpec = variant; - -struct OpTaskBinding { - OpTaskBinding() = default; - - static_assert(is_subeq_variant::value, ""); - - void bind(slot_id, OpTensorSpec const &); - void bind_grad(slot_id, OpTensorSpec const &); - - template - void bind(slot_id name, VariadicTensorRef const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_device_specific_arg(slot_id name, T const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_device_specific_arg(slot_id name, OpArgRef const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_arg(slot_id name, T const &t) { - this->insert_arg_spec(name, ConcreteArgSpec::create(t)); - } - - template 
- void bind_arg(slot_id name, RuntimeArgRef const &ref) { - this->insert_arg_spec(name, RuntimeArgRefSpec::create(ref)); - } - - template - void bind_arg(slot_id name, OpArgRef const &ref) { - this->insert_arg_spec(name, OpArgRefSpec::create(ref)); - } - - template - void bind_arg(slot_id name, TypedFuture const &f) { - this->insert_arg_spec(name, CheckedTypedFuture::create(f)); - } - - template - void bind_arg(slot_id name, TypedFutureMap const &fm) { - this->insert_arg_spec(name, CheckedTypedFutureMap::create(fm)); - } - - std::unordered_map, OpTensorSpec> const & - get_tensor_bindings() const; - std::unordered_map const &get_arg_bindings() const; - -private: - void insert_arg_spec(slot_id name, OpArgSpec const &arg_spec) { - assert(!contains_key(this->arg_bindings, name)); - this->arg_bindings.insert({name, arg_spec}); - } - - // template - // ArgSpec generate_arg_spec(T const &t) { - // static_assert(is_serializable, "Type must be serializable"); - - // size_t pre_size = serializer.get_used_bytes(); - // ff_task_serialize(serializer, t); - // size_t post_size = serializer.get_used_bytes(); - // return { - // typeid(T), - // pre_size, - // post_size - pre_size - // }; - // } - - /* Legion::Serializer serializer; */ - std::unordered_map arg_bindings; - std::unordered_map, OpTensorSpec> tensor_bindings; -}; - -struct OpTaskInvocation : public use_visitable_cmp { -public: - OpTaskInvocation() = delete; - OpTaskInvocation(task_id_t const &task_id, OpTaskBinding const &binding) - : task_id(task_id), binding(binding) {} - -public: - task_id_t task_id; - OpTaskBinding binding; -}; - -OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); -OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -OpTaskSignature get_op_signature(task_id_t const &); - -/* std::unordered_map get_regions_idxs(TaskArgumentFormat - * const &); */ - -/* TaskArgumentFormat compile_task_invocation(OpTaskSignature const &, - * OpTaskBinding const &); */ - -} // namespace FlexFlow - 
-#endif diff --git a/lib/runtime/src/task_spec/op_tensor_spec.h b/lib/runtime/src/task_spec/op_tensor_spec.h deleted file mode 100644 index d859bb3072..0000000000 --- a/lib/runtime/src/task_spec/op_tensor_spec.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_TENSOR_SPEC_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_TENSOR_SPEC_REF_H - -#include "op_task_signature.h" - -namespace FlexFlow { - -struct OpTensorSpec { - TensorRole role; - req idx; -}; -FF_VISITABLE_STRUCT(OpTensorSpec, role, idx); - -OpTensorSpec input_tensor(int); -OpTensorSpec output_tensor(int); -OpTensorSpec weight_tensor(int); - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h deleted file mode 100644 index 9cc05b8252..0000000000 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ /dev/null @@ -1,193 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H - -#include "accessor.h" -#include "device_specific.h" -#include "realm_allocator.h" -#include "runtime/config.h" -#include "utils/exception.h" -#include "utils/stack_map.h" -#include "utils/strong_typedef.h" -#include - -namespace FlexFlow { - -struct region_idx_t : strong_typedef { - using strong_typedef::strong_typedef; -}; - -FF_TYPEDEF_HASHABLE(region_idx_t); -FF_TYPEDEF_PRINTABLE(region_idx_t, "region_idx"); - -using NonvariadicFormat = region_idx_t; -using VariadicFormat = std::vector; - -using TensorArgumentFormat = variant; - -bool is_variadic(TensorArgumentFormat const &); -VariadicFormat get_variadic_format(TensorArgumentFormat const &); -NonvariadicFormat get_nonvariadic_format(TensorArgumentFormat const &); - -struct TaskArgumentFormat { - std::type_index type; - size_t start; - req end; -}; -FF_VISITABLE_STRUCT(TaskArgumentFormat, type, start, end); - -struct FutureArgumentFormat { - std::type_index type; - req future_idx; -}; 
-FF_VISITABLE_STRUCT(FutureArgumentFormat, type, future_idx); - -struct TaskArgumentsFormat { - TaskArgumentsFormat() = default; - - stack_map region_idxs; - stack_map args; - stack_map futures; - stack_map regions; - stack_map data_types; - - void insert(std::pair const &); - void insert(std::pair const &); - - void insert(region_idx_t, Legion::PrivilegeMode, DataType); - void insert(slot_id, region_idx_t); - void insert(slot_id, std::vector const &); -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - TaskArgumentsFormat, region_idxs, args, futures, regions, data_types); - -Legion::PrivilegeMode get_privileges(TaskArgumentsFormat const &, - region_idx_t const &); -Legion::PrivilegeMode get_privileges(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -Permissions get_permissions(TaskArgumentsFormat const &, region_idx_t const &); -Permissions get_permissions(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -region_idx_t get_region_idx(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &); - -struct TaskArgumentAccessor { - TaskArgumentAccessor(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - Allocator get_allocator() const { - return get_gpu_memory_allocator(this->task); - } - - template - T const &get_argument(slot_id slot) const { - NOT_IMPLEMENTED(); - // TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); - // std::type_index actual_type = arg_fmt.type; - // std::type_index requested_type = {typeid(T)}; - - // if (actual_type != requested_type) { - // throw mk_runtime_error( - // "Type mismatch in argument access (\"{}\" != \"{}\")", - // actual_type.name(), - // requested_type.name()); - // } - - // void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; - // Legion::Deserializer dez(start_ptr, arg_fmt.start); - - // return ff_task_deserialize(dez); - } - - template 
- optional get_optional_argument(slot_id) const { - NOT_IMPLEMENTED(); - } - - template - std::vector get_variadic_argument(slot_id) const { - NOT_IMPLEMENTED(); - } - - template - privilege_mode_to_accessor - get_generic_accessor(region_idx_t const &idx) const { - auto tensor_privs = get_permissions(this->args_fmt, idx); - if (tensor_privs != PRIV) { - throw mk_runtime_error( - "Privilege mismatch while accessing tensor: {} != {}", - tensor_privs, - PRIV); - } - - return helperGetGenericTensorAccessor( - get_datatype(this->args_fmt, idx), - regions[idx.value()], - task->regions[idx.value()], - FID_DATA, - ctx, - runtime); - } - - template - privilege_mode_to_accessor get_tensor(slot_id slot) const { - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - - return this->get_generic_accessor(argument_format); - } - - template - privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - - template - std::vector> - get_variadic_tensor(slot_id slot) const { - std::vector> result; - - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - for (NonvariadicFormat const &argument : argument_format) { - result.push_back(this->get_generic_accessor(argument)); - } - - return result; - } - - template - std::vector> - get_variadic_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - - template - T *unwrap(DeviceSpecific const &arg) const { - return arg.get(this->get_device_idx()); - } - - template - DeviceSpecific create_device_specific(Args &&...args) const { - return DeviceSpecific::create(this->get_device_idx(), - std::forward(args)...); - } - - size_t get_device_idx() const { - NOT_IMPLEMENTED(); - } - -private: - Legion::Task const *task; - std::vector const ®ions; - Legion::Context ctx; - Legion::Runtime *runtime; - TaskArgumentsFormat const &args_fmt; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/task_spec/variadic_tensor_ref.h b/lib/runtime/src/task_spec/variadic_tensor_ref.h 
deleted file mode 100644 index a9d1b54731..0000000000 --- a/lib/runtime/src/task_spec/variadic_tensor_ref.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_VARIADIC_TENSOR_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_VARIADIC_TENSOR_ARG_REF_H - -#include "arg_ref.h" -#include "op_tensor_spec.h" - -namespace FlexFlow { - -enum class VariadicTensorRefType { INPUT_TENSORS }; - -template -using VariadicTensorRef = ArgRef; - -VariadicTensorRef get_input_tensors() { - return {VariadicTensorRefType::INPUT_TENSORS}; -} - -} // namespace FlexFlow - -#endif diff --git a/lib/utils/include/utils/type_index.h b/lib/utils/include/utils/type_index.h index 49e893faa0..77a377a48d 100644 --- a/lib/utils/include/utils/type_index.h +++ b/lib/utils/include/utils/type_index.h @@ -3,17 +3,18 @@ #include "fmt.h" #include +#include namespace FlexFlow { template -std::type_index type_index() { +std::type_index get_type_index_for_type() { return std::type_index(typeid(T)); } template bool matches(std::type_index idx) { - return idx == type_index(); + return idx == get_type_index_for_type(); } } // namespace FlexFlow