From 2dc4c60531f233b5cd364107765a907d1567a996 Mon Sep 17 00:00:00 2001 From: reyna-abhyankar Date: Fri, 10 May 2024 10:44:10 -0700 Subject: [PATCH 01/24] Add allocators --- lib/CMakeLists.txt | 1 + lib/local-execution/CMakeLists.txt | 15 +++++++++++ lib/local-execution/include/local_allocator.h | 24 +++++++++++++++++ .../include/tracked_allocator.h | 21 +++++++++++++++ lib/local-execution/src/local_allocator.cc | 20 ++++++++++++++ lib/local-execution/src/tracked_allocator.cc | 27 +++++++++++++++++++ 6 files changed, 108 insertions(+) create mode 100644 lib/local-execution/CMakeLists.txt create mode 100644 lib/local-execution/include/local_allocator.h create mode 100644 lib/local-execution/include/tracked_allocator.h create mode 100644 lib/local-execution/src/local_allocator.cc create mode 100644 lib/local-execution/src/tracked_allocator.cc diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index f7c166f0dd..8ed5d87d86 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(compiler) add_subdirectory(runtime) add_subdirectory(op-attrs) add_subdirectory(kernels) +add_subdirectory(local-execution) add_subdirectory(utils) add_subdirectory(ffi) add_subdirectory(substitutions) diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt new file mode 100644 index 0000000000..ee1d8fecdc --- /dev/null +++ b/lib/local-execution/CMakeLists.txt @@ -0,0 +1,15 @@ +ff_add_library( + NAME + local-execution + SRC_PATTERNS + src/*.cc + PUBLIC_INCLUDE + include/ + PRIVATE_INCLUDE + src/ + DEPS + op-attrs + utils + kernels + pcg +) \ No newline at end of file diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local_allocator.h new file mode 100644 index 0000000000..f4b253b281 --- /dev/null +++ b/lib/local-execution/include/local_allocator.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H +#define _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H + +#include "kernels/allocation.h" 
+#include + +namespace FlexFlow { + +struct LocalAllocator : public IAllocator { + LocalAllocator() = default; + LocalAllocator(LocalAllocator const &) = delete; + LocalAllocator(LocalAllocator &&) = delete; + ~LocalAllocator() = default; + + void *allocate(size_t) override; + void deallocate(void *) override; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalAllocator); + +Allocator get_local_memory_allocator(); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/tracked_allocator.h new file mode 100644 index 0000000000..64cc31e858 --- /dev/null +++ b/lib/local-execution/include/tracked_allocator.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H + +#include "kernels/allocation.h" + +namespace FlexFlow { + +struct TrackedAllocator: public Allocator { + Allocator() = delete; + + void *allocate(size_t mem_size); + void deallocate(void *ptr); + size_t get_current_mem_usage(); + +private: + size_t current_mem_usage; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local_allocator.cc b/lib/local-execution/src/local_allocator.cc new file mode 100644 index 0000000000..0bb7d04574 --- /dev/null +++ b/lib/local-execution/src/local_allocator.cc @@ -0,0 +1,20 @@ +#include "local_allocator.h" +#include "kernels/device.h" + +namespace FlexFlow { + +void *LocalAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + return ptr; +} + +void LocalAllocator::deallocate(void *ptr) { + checkCUDA(cudaFree(ptr)); +} + +Allocator get_local_memory_allocator() { + return Allocator::create(); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc new file mode 100644 index 0000000000..6e666b647c --- /dev/null +++ b/lib/local-execution/src/tracked_allocator.cc @@ -0,0 +1,27 @@ 
+#include "tracked_allocator.h" +#include "kernels/device.h" + +namespace FlexFlow { + +void *TrackedAllocator::allocate(size_t mem_size) { + void *ptr = this->i_allocator->allocate(mem_size); + this->curr_mem_usage += mem_size; + return ptr; +} + +void TrackedAllocator::deallocate(void *ptr) { + size_t psize; + checkCUDA(cuMemGetAddressRange(nullptr, &psize, ptr)); + this->i_allocator->deallocate(ptr); + this->curr_mem_usage -= psize; +} + +size_t TrackedAllocator::get_current_mem_usage() { + return this->curr_mem_usage; +} + +TrackedAllocator get_tracked_local_allocator() { + return Allocator::create(); +} + +} // namespace FlexFlow From 2488514f98f38ab7985b21f6ee29dace6a8e7f6e Mon Sep 17 00:00:00 2001 From: reyna-abhyankar Date: Tue, 14 May 2024 06:40:39 -0700 Subject: [PATCH 02/24] Computation Graph and Builder --- lib/pcg/include/pcg/computation_graph.h | 103 +++++- .../include/pcg/computation_graph_builder.h | 342 +++++++++--------- lib/pcg/src/computation_graph_builder.cc | 306 ++++++++++------ lib/utils/include/utils/strong_typedef.h | 4 + 4 files changed, 475 insertions(+), 280 deletions(-) diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 11dad70356..4d4fa86efa 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -4,6 +4,8 @@ #include "layer.h" #include "operator_guid_t.h" #include "tensor.h" +#include "tensor_guid_t.h" +#include "utils/containers.h" #include "utils/graph.h" #include "utils/strong_typedef.h" #include "visit_struct/visit_struct.hpp" @@ -14,12 +16,105 @@ struct ComputationGraph : public strong_typedef> { using strong_typedef::strong_typedef; + + std::vector traverse() { + std::vector layers = get_topological_ordering(this->value()); + return transform(layers, [&](Node const &e) -> operator_guid_t { + return operator_guid_t{e}; + }); + } + + std::vector traverse_reverse_order() { + std::vector layers = + 
reversed>(get_topological_ordering(this->value())); + return transform(layers, [&](Node const &e) -> operator_guid_t { + return operator_guid_t{e}; + }); + } + + bool out_edge_comparator(MultiDiOutput x, MultiDiOutput y) { + return x.src_idx < y.src_idx; + } + + std::vector + sort_edge_set(std::unordered_set edges) { + std::unordered_set outputs = + transform(edges, [&](MultiDiEdge const &e) -> MultiDiOutput { + return MultiDiOutput(e); + }); + std::vector sorted_outputs(outputs.begin(), outputs.end()); + sort(sorted_outputs.begin(), sorted_outputs.end(), out_edge_comparator); + return transform(sorted_outputs, + [&](MultiDiOutput const &e) -> tensor_guid_t { + return tensor_guid_t{e}; + }); + } + + std::vector get_outgoing_tensors(operator_guid_t n) { + return sort_edge_set(get_outgoing_edges(this->value(), n.value())); + } + + std::vector get_incoming_tensors(operator_guid_t n) { + return sort_edge_set(get_incoming_edges(this->value(), n.value())); + } + + operator_guid_t add_node(Layer const &layer) { + Node added_node = this->value().add_node(layer); + return operator_guid_t{added_node}; + } + + void add_output(tensor_guid_t const &output, Tensor const &tensor) { + this->value().add_output(output.value(), tensor); + } + + tensor_guid_t create_outgoing_edge(operator_guid_t node, int idx) { + MultiDiOutput edge = {node.value(), NodePort{idx}}; + return tensor_guid_t{edge}; + } + + tensor_guid_t create_outgoing_edge_with_label(operator_guid_t node, + int idx, + Tensor tensor) { + tensor_guid_t tensor_guid = create_outgoing_edge(node, idx); + add_output(tensor_guid, tensor); + return tensor_guid; + } + + void add_incoming_edges(std::vector const &incoming_edges, + operator_guid_t node) { + size_t incoming_edge_dst_port = 0; + for (tensor_guid_t input : incoming_edges) { + MultiDiOutput input_view = input.value(); + MultiDiEdge edge = {node.value(), + NodePort{incoming_edge_dst_port++}, + input_view.src, + input_view.src_idx}; + this->value().add_edge(edge); + } + } 
+ + Layer &at(operator_guid_t const &n) { + return this->value().at(n.value()); + } + + Layer const &at(operator_guid_t const &n) const { + return this->value().at(n.value()); + } + + Tensor &at(tensor_guid_t const &e) { + return this->value().at(e.value()); + } + + Tensor const &at(tensor_guid_t const &e) const { + return this->value().at(e.value()); + } + + CompGraphOperatorAttrs get_layer_attrs(operator_guid_t const &n) const { + return this->at(n).attrs; + } }; +CHECK_WELL_BEHAVED_VALUE_TYPE_NO_HASH(ComputationGraph); } // namespace FlexFlow -namespace FlexFlow { -static_assert(is_well_behaved_value_type_no_hash::value, ""); -} - #endif diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 035f0cad0b..1be8d7ad0e 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_PCG_INCLUDE_PCG_COMPUTATION_GRAPH_BUILDER_H #include "computation_graph.h" +#include "optimizer.h" namespace FlexFlow { @@ -12,74 +13,78 @@ struct ComputationGraphBuilder // C++ APIs for constructing models // Add an exp layer - Tensor exp(Tensor const &, - std::optional const &name = std::nullopt); + tensor_guid_t exp(tensor_guid_t const &, + std::optional const &name = std::nullopt); // Add an add layer - Tensor add(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t add(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a subtract layer - Tensor subtract(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t subtract(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a multiply layer - Tensor multiply(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t multiply(tensor_guid_t const &x, + tensor_guid_t const &y, + 
std::optional const &name = std::nullopt); // Add a divide layer - Tensor divide(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t divide(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a max layer - Tensor max(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t max(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a min layer - Tensor min(Tensor const &x, - Tensor const &y, - std::optional const &name = std::nullopt); + tensor_guid_t min(tensor_guid_t const &x, + tensor_guid_t const &y, + std::optional const &name = std::nullopt); // Add a rsqrt layer - Tensor rsqrt(Tensor const &x, - std::optional const &name = std::nullopt); + tensor_guid_t rsqrt(tensor_guid_t const &x, + std::optional const &name = std::nullopt); // Add a pow layer - Tensor pow(Tensor const &x, - float exponent, - std::optional const &name = std::nullopt); - // Add a scalar multiply layer - Tensor scalar_multiply(Tensor const &x, - float scalar, - std::optional const &name = std::nullopt); - Tensor scalar_add(Tensor const &x, - float scalar, - std::optional const &name = std::nullopt); - Tensor scalar_sub(Tensor const &lhs, - float rhs, + tensor_guid_t pow(tensor_guid_t const &x, + float exponent, std::optional const &name = std::nullopt); - Tensor scalar_truediv(Tensor const &numerator, - float denominator, - std::optional const &name = std::nullopt); + // Add a scalar multiply layer + tensor_guid_t + scalar_multiply(tensor_guid_t const &x, + float scalar, + std::optional const &name = std::nullopt); + tensor_guid_t + scalar_add(tensor_guid_t const &x, + float scalar, + std::optional const &name = std::nullopt); + tensor_guid_t + scalar_sub(tensor_guid_t const &lhs, + float rhs, + std::optional const &name = std::nullopt); + tensor_guid_t + scalar_truediv(tensor_guid_t const &numerator, + float 
denominator, + std::optional const &name = std::nullopt); // Add a sin layer - Tensor sin(Tensor const &x, - std::optional const &name = std::nullopt); + tensor_guid_t sin(tensor_guid_t const &x, + std::optional const &name = std::nullopt); // Add a cos layer - Tensor cos(Tensor const &x, - std::optional const &name = std::nullopt); + tensor_guid_t cos(tensor_guid_t const &x, + std::optional const &name = std::nullopt); // Add an activation layer - Tensor relu(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor identity(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor gelu(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor sigmoid(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor tanh(Tensor const &x, - std::optional const &name = std::nullopt); - Tensor elu(Tensor const &x, - std::optional const &name = std::nullopt); + tensor_guid_t relu(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t identity(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t gelu(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t sigmoid(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t tanh(tensor_guid_t const &x, + std::optional const &name = std::nullopt); + tensor_guid_t elu(tensor_guid_t const &x, + std::optional const &name = std::nullopt); // Add a 2D convolutional layer - Tensor conv2d( - Tensor const &input, + tensor_guid_t conv2d( + tensor_guid_t const &input, int outChannels, int kernelH, int kernelW, @@ -95,13 +100,13 @@ struct ComputationGraphBuilder std::optional const &kernel_regularizer = std::nullopt, std::optional const &name = std::nullopt); // Add a dropout layer - Tensor dropout(Tensor const &input, - float rate, - unsigned long long seed = 0, - std::optional const &name = std::nullopt); + tensor_guid_t dropout(tensor_guid_t const &input, + float rate, + 
unsigned long long seed = 0, + std::optional const &name = std::nullopt); // Add an embedding layer - Tensor embedding( - Tensor const &input, + tensor_guid_t embedding( + tensor_guid_t const &input, int num_entries, int outDim, AggregateOp aggr, @@ -109,43 +114,48 @@ struct ComputationGraphBuilder std::optional const &kernel_initializer = std::nullopt, std::optional const &name = std::nullopt); // Add a gather layer - std::vector - gather(Tensor const &input, - Tensor const &index, + std::vector + gather(tensor_guid_t const &input, + tensor_guid_t const &index, ff_dim_t dim, std::optional const &name = std::nullopt); // Add a cache layer - Tensor cache(Tensor const &input, - int num_batches, - std::function - score_f = {}, - std::optional const &name = std::nullopt); + tensor_guid_t + cache(tensor_guid_t const &input, + int num_batches, + std::function + score_f = {}, + std::optional const &name = std::nullopt); // Add a 2D pooling layer - Tensor pool2d(Tensor const &input, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, - PoolOp type = PoolOp::MAX, - std::optional const &activation = std::nullopt, - std::optional const &name = std::nullopt); - Tensor layer_norm(Tensor const &input, - std::vector const &axes, - bool elementwise_affine, - float eps, - std::optional const &name = std::nullopt); - Tensor batch_norm(Tensor const &input, - bool relu = true, - std::optional const &name = std::nullopt); - Tensor batch_matmul(Tensor const &A, - Tensor const &B, - int a_seq_length_dim = -1, - int b_seq_length_dim = -1, - std::optional const &name = std::nullopt); - Tensor - dense(Tensor const &input, + tensor_guid_t + pool2d(tensor_guid_t const &input, + int kernelH, + int kernelW, + int strideH, + int strideW, + int paddingH, + int paddingW, + PoolOp type = PoolOp::MAX, + std::optional const &activation = std::nullopt, + std::optional const &name = std::nullopt); + tensor_guid_t + layer_norm(tensor_guid_t const &input, + 
std::vector const &axes, + bool elementwise_affine, + float eps, + std::optional const &name = std::nullopt); + tensor_guid_t + batch_norm(tensor_guid_t const &input, + bool relu = true, + std::optional const &name = std::nullopt); + tensor_guid_t + batch_matmul(tensor_guid_t const &A, + tensor_guid_t const &B, + int a_seq_length_dim = -1, + int b_seq_length_dim = -1, + std::optional const &name = std::nullopt); + tensor_guid_t + dense(tensor_guid_t const &input, int outDim, std::optional activation = std::nullopt, bool use_bias = true, @@ -154,55 +164,59 @@ struct ComputationGraphBuilder std::optional const &bias_initializer = std::nullopt, std::optional const &name = std::nullopt); // Add a cast layer - Tensor cast(Tensor const &input, - DataType dtype, - std::optional const &name = std::nullopt); + tensor_guid_t cast(tensor_guid_t const &input, + DataType dtype, + std::optional const &name = std::nullopt); // Add a concat layer - Tensor concat(int n, - std::vector const &tensors, - int axis, - std::optional const &name = std::nullopt); + tensor_guid_t concat(int n, + std::vector const &tensors, + int axis, + std::optional const &name = std::nullopt); // Add a mean layer - Tensor mean(Tensor const &input, - std::vector const &dims, - bool keepdims, - char const *name); + tensor_guid_t mean(tensor_guid_t const &input, + std::vector const &dims, + bool keepdims, + char const *name); // Add a split layer - void split(Tensor const &input, - Tensor *outputs, + void split(tensor_guid_t const &input, + tensor_guid_t *outputs, std::vector const &split, int axis, std::optional const &name = std::nullopt); // Add a flat layer - Tensor flat(Tensor const &input, - std::optional const &name = std::nullopt); + tensor_guid_t flat(tensor_guid_t const &input, + std::optional const &name = std::nullopt); // Add a softmax layer - Tensor softmax(Tensor const &input, - int dim = -1, - std::optional const &name = std::nullopt); + tensor_guid_t softmax(tensor_guid_t const &input, + int 
dim = -1, + std::optional const &name = std::nullopt); // Create input tensors and constants - Tensor transpose(Tensor const &input, - std::vector const &perm, - std::optional const &name = std::nullopt); - Tensor reduce_sum(Tensor const &input, - std::vector const &axes, - bool keepdims = false, - std::optional const &name = std::nullopt); - Tensor reshape(Tensor const &input, - std::vector const &shape, - std::optional const &name = std::nullopt); - Tensor reverse(Tensor const &input, - int axis, + tensor_guid_t input(Tensor const &input_tensor, + std::optional const &name = std::nullopt); + tensor_guid_t + transpose(tensor_guid_t const &input, + std::vector const &perm, + std::optional const &name = std::nullopt); + tensor_guid_t + reduce_sum(tensor_guid_t const &input, + std::vector const &axes, + bool keepdims = false, std::optional const &name = std::nullopt); - void top_k(Tensor const &input, - Tensor *outputs, + tensor_guid_t reshape(tensor_guid_t const &input, + std::vector const &shape, + std::optional const &name = std::nullopt); + tensor_guid_t reverse(tensor_guid_t const &input, + int axis, + std::optional const &name = std::nullopt); + void top_k(tensor_guid_t const &input, + tensor_guid_t *outputs, int k, bool sorted, std::optional const &name = std::nullopt); - Tensor multihead_attention( - Tensor const &query, - Tensor const &key, - Tensor const &value, + tensor_guid_t multihead_attention( + tensor_guid_t const &query, + tensor_guid_t const &key, + tensor_guid_t const &value, int embed_dim, int num_heads, int kdim = 0, @@ -213,62 +227,66 @@ struct ComputationGraphBuilder bool add_zero_attn = false, std::optional initializer = std::nullopt, std::optional const &name = std::nullopt); - Tensor create_tensor(TensorShape const &, bool create_grad = true); + tensor_guid_t create_tensor(TensorShape const &, bool create_grad = true); Parameter create_weight( TensorShape const &, bool create_grad = true, std::optional const &initializer = std::nullopt, 
std::optional sync_type = std::nullopt); - std::vector get_outputs(Layer const &) const; - Tensor get_output(Layer const &, int idx) const; - - Tensor at(MultiDiEdge const &) const; - Layer at(Node const &) const; + std::vector get_outputs(operator_guid_t const &) const; + tensor_guid_t get_output(operator_guid_t const &, int idx) const; + Tensor get_tensor(tensor_guid_t const &) const; private: - Tensor broadcast(Tensor const &, TensorShape const &); + tensor_guid_t broadcast(tensor_guid_t const &, TensorShape const &); void add_layer(Layer const &layer, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs); - Tensor add_layer( + std::vector const &inputs, + std::vector const &weights, + std::vector const &outputs); + tensor_guid_t add_layer( Layer const &layer, - std::vector const &inputs, + std::vector const &inputs, std::vector>> const &weight_shapes, - TensorShape const &output_shape); - std::vector add_layer( + Tensor const &output); + std::vector add_layer( Layer const &layer, - std::vector const &inputs, + std::vector const &inputs, std::vector>> const &weight_shapes, - std::vector const &output_shapes); + std::vector const &outputs); - Tensor as_type(Tensor const &, DataType, std::string const &); + tensor_guid_t as_type(tensor_guid_t const &, DataType, std::string const &); TensorShape get_broadcast_target_shape(std::vector const &); - Tensor element_binary(OperatorType, - Tensor const &lhs, - Tensor const &rhs, - std::optional const &name = std::nullopt); + tensor_guid_t + element_binary(OperatorType, + tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name = std::nullopt); - Tensor element_unary(OperatorType, - Tensor const &input, - std::optional const &name = std::nullopt); - Tensor element_scalar_unary( + tensor_guid_t + element_unary(OperatorType, + tensor_guid_t const &input, + std::optional const &name = std::nullopt); + tensor_guid_t element_scalar_unary( OperatorType, - Tensor const 
&input, + tensor_guid_t const &input, float scalar, std::optional const &name = std::nullopt); - Tensor element_unary(ElementUnaryAttrs const &, - Tensor const &input, - std::optional const &name = std::nullopt); - Tensor element_scalar_unary(ElementScalarUnaryAttrs const &attrs, - Tensor const &x, - std::optional const &maybe_name); + tensor_guid_t + element_unary(ElementUnaryAttrs const &, + tensor_guid_t const &input, + std::optional const &name = std::nullopt); + tensor_guid_t + element_scalar_unary(ElementScalarUnaryAttrs const &attrs, + tensor_guid_t const &x, + std::optional const &maybe_name); + + std::unordered_map pre_edge_mapping; public: ComputationGraph computation_graph; diff --git a/lib/pcg/src/computation_graph_builder.cc b/lib/pcg/src/computation_graph_builder.cc index c2e008231e..f308a4b242 100644 --- a/lib/pcg/src/computation_graph_builder.cc +++ b/lib/pcg/src/computation_graph_builder.cc @@ -6,40 +6,48 @@ namespace FlexFlow { -void ComputationGraphBuilder::add_layer(Layer const &layer, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs) { - NOT_IMPLEMENTED(); -} -Tensor ComputationGraphBuilder::add_layer( +tensor_guid_t ComputationGraphBuilder::add_layer( Layer const &layer, - std::vector const &inputs, + std::vector const &inputs, std::vector>> const &weight_shapes, - TensorShape const &output_shape) { - NOT_IMPLEMENTED(); + Tensor const &output) { + operator_guid_t node = computation_graph.add_node(layer); + this->computation_graph.add_incoming_edges(inputs, node); + return this->computation_graph.create_outgoing_edge_with_label( + node, 0, output); } -std::vector ComputationGraphBuilder::add_layer( + +std::vector ComputationGraphBuilder::add_layer( Layer const &layer, - std::vector const &inputs, + std::vector const &inputs, std::vector>> const &weight_shapes, - std::vector const &output_shapes) { - NOT_IMPLEMENTED(); + std::vector const &outputs) { + operator_guid_t node = 
computation_graph.add_node(layer); + this->computation_graph.add_incoming_edges(inputs, node); + std::vector output_tensor_guids; + for (int i = 0; i < outputs.size(); ++i) { + output_tensor_guids.push_back( + this->computation_graph.create_outgoing_edge_with_label( + node, i, outputs[i])); + } + return output_tensor_guids; } -Tensor ComputationGraphBuilder::broadcast(Tensor const &, TensorShape const &) { +tensor_guid_t ComputationGraphBuilder::broadcast(tensor_guid_t const &, + TensorShape const &) { NOT_IMPLEMENTED(); } -Tensor ComputationGraphBuilder::cast(Tensor const &input, - DataType dtype, - std::optional const &name){ - NOT_IMPLEMENTED()} +tensor_guid_t + ComputationGraphBuilder::cast(tensor_guid_t const &input, + DataType dtype, + std::optional const &name){ + NOT_IMPLEMENTED()} -Tensor ComputationGraphBuilder::as_type(Tensor const &x, - DataType data_type, - std::string const &name) { +tensor_guid_t ComputationGraphBuilder::as_type(tensor_guid_t const &x, + DataType data_type, + std::string const &name) { if (x.data_type < data_type) { return this->cast(x, data_type, name); } else if (x.data_type > data_type) { @@ -64,13 +72,14 @@ static std::string get_default_name(std::variant const &attrs) { return get_default_name(widen(attrs)); } -Tensor ComputationGraphBuilder::element_unary( +tensor_guid_t ComputationGraphBuilder::element_unary( ElementUnaryAttrs const &attrs, - Tensor const &x, + tensor_guid_t const &x, std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(attrs)); - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; TensorShape output_shape = get_output_shape(attrs, input); @@ -78,13 +87,14 @@ Tensor ComputationGraphBuilder::element_unary( return this->add_layer(layer, {input}, {}, output_shape); } -Tensor ComputationGraphBuilder::element_scalar_unary( +tensor_guid_t 
ComputationGraphBuilder::element_scalar_unary( ElementScalarUnaryAttrs const &attrs, - Tensor const &x, + tensor_guid_t const &x, std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(attrs)); - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; TensorShape output_shape = get_output_shape(attrs, input); @@ -92,39 +102,41 @@ Tensor ComputationGraphBuilder::element_scalar_unary( return this->add_layer(layer, {input}, {}, output_shape); } -Tensor ComputationGraphBuilder::element_unary( +tensor_guid_t ComputationGraphBuilder::element_unary( OperatorType op_type, - Tensor const &input, + tensor_guid_t const &input, std::optional const &name) { ElementUnaryAttrs attrs = {op_type}; return this->element_unary(attrs, input, name); } -Tensor ComputationGraphBuilder::element_scalar_unary( +tensor_guid_t ComputationGraphBuilder::element_scalar_unary( OperatorType op_type, - Tensor const &input, + tensor_guid_t const &input, float scalar, std::optional const &name) { ElementScalarUnaryAttrs attrs = {op_type, scalar}; return this->element_scalar_unary(attrs, input, name); } -Tensor ComputationGraphBuilder::element_binary( +tensor_guid_t ComputationGraphBuilder::element_binary( OperatorType op_type, - Tensor const &lhs, - Tensor const &rhs, + tensor_guid_t const &lhs, + tensor_guid_t const &rhs, std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(op_type)); TensorShape compute_shape = this->get_broadcast_target_shape({lhs, rhs}); DataType compute_type = std::max(lhs.data_type, rhs.data_type); - Tensor const lhs_input = this->as_type(this->broadcast(lhs, compute_shape), - compute_type, - name + "_inputl_pre_cast"); - Tensor const rhs_input = this->as_type(this->broadcast(rhs, compute_shape), - compute_type, - name + "_inputr_pre_cast"); + tensor_guid_t const lhs_input = 
+ this->as_type(this->broadcast(lhs, compute_shape), + compute_type, + name + "_inputl_pre_cast"); + tensor_guid_t const rhs_input = + this->as_type(this->broadcast(rhs, compute_shape), + compute_type, + name + "_inputr_pre_cast"); ElementBinaryAttrs attrs = {op_type, compute_type, false, false}; @@ -134,127 +146,179 @@ Tensor ComputationGraphBuilder::element_binary( return this->add_layer(layer, {lhs_input, rhs_input}, {}, output_shape); } -Tensor ComputationGraphBuilder::exp(Tensor const &input, - std::optional const &name) { +tensor_guid_t ComputationGraphBuilder::dense( + tensor_guid_t const &input, + int outDim, + std::optional activation, + bool use_bias, + DataType data_type, + std::optional const &kernel_initializer, + std::optional const &bias_initializer, + std::optional const &name) { + LinearAttrs attrs = { + outDim, use_bias, data_type, activation.value(), std::nullopt}; + std::string unwrapped_name = name.value_or(get_default_name(attrs)); + + tensor_guid_t input_recast = + this->as_type(input, data_type, unwrapped_name + "input_recast"); + + Layer layer = {attrs, name}; + TensorShape output_shape = get_output_shape(attrs, input_recast); + Tensor output = { + output_shape.dims, data_type, std::nullopt, false, std::nullopt}; + + std::vector>> weights; + + weights.push_back( + {get_weights_shape(attrs, input_recast), kernel_initializer}); + + if (use_bias) { + weights.push_back({get_bias_shape(attrs, input_recast), bias_initializer}); + } + + return this->add_layer(layer, {input_recast}, weights, output); +} + +tensor_guid_t + ComputationGraphBuilder::exp(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::EXP, input, name); } -Tensor ComputationGraphBuilder::add(Tensor const &lhs, - Tensor const &rhs, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::add(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_ADD, lhs, rhs, 
name); } -Tensor - ComputationGraphBuilder::subtract(Tensor const &lhs, - Tensor const &rhs, +tensor_guid_t + ComputationGraphBuilder::subtract(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, std::optional const &name) { return this->element_binary(Op::EW_SUB, lhs, rhs, name); } -Tensor - ComputationGraphBuilder::multiply(Tensor const &lhs, - Tensor const &rhs, +tensor_guid_t + ComputationGraphBuilder::multiply(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, std::optional const &name) { return this->element_binary(Op::EW_MUL, lhs, rhs, name); } -Tensor ComputationGraphBuilder::divide(Tensor const &lhs, - Tensor const &rhs, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::divide(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_DIV, lhs, rhs, name); } -Tensor ComputationGraphBuilder::max(Tensor const &lhs, - Tensor const &rhs, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::max(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_MAX, lhs, rhs, name); } -Tensor ComputationGraphBuilder::min(Tensor const &lhs, - Tensor const &rhs, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::min(tensor_guid_t const &lhs, + tensor_guid_t const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_MIN, lhs, rhs, name); } -Tensor ComputationGraphBuilder::rsqrt(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::rsqrt(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::RSQRT, input, name); } -Tensor ComputationGraphBuilder::pow(Tensor const &input, - float exponent, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::pow(tensor_guid_t const &input, + float exponent, + std::optional const &name) { return this->element_scalar_unary(Op::POW, input, 
exponent, name); } -Tensor ComputationGraphBuilder::scalar_multiply( - Tensor const &input, float scalar, std::optional const &name) { +tensor_guid_t ComputationGraphBuilder::scalar_multiply( + tensor_guid_t const &input, + float scalar, + std::optional const &name) { return this->element_scalar_unary(Op::SCALAR_MULTIPLY, input, scalar, name); } -Tensor ComputationGraphBuilder::scalar_add( - Tensor const &input, float scalar, std::optional const &name) { +tensor_guid_t ComputationGraphBuilder::scalar_add( + tensor_guid_t const &input, + float scalar, + std::optional const &name) { return this->element_scalar_unary(Op::SCALAR_ADD, input, scalar, name); } -Tensor ComputationGraphBuilder::scalar_sub( - Tensor const &lhs, float rhs, std::optional const &name) { +tensor_guid_t ComputationGraphBuilder::scalar_sub( + tensor_guid_t const &lhs, + float rhs, + std::optional const &name) { return this->element_scalar_unary(Op::SCALAR_SUB, lhs, rhs, name); } -Tensor ComputationGraphBuilder::scalar_truediv( - Tensor const &numerator, +tensor_guid_t ComputationGraphBuilder::scalar_truediv( + tensor_guid_t const &numerator, float denominator, std::optional const &name) { return this->element_scalar_unary( Op::SCALAR_TRUE_DIV, numerator, denominator, name); } -Tensor ComputationGraphBuilder::sin(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::sin(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::SIN, input, name); } -Tensor ComputationGraphBuilder::cos(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::cos(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::COS, input, name); } -Tensor ComputationGraphBuilder::relu(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::relu(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::RELU, input, 
name); } -Tensor - ComputationGraphBuilder::identity(Tensor const &input, +tensor_guid_t + ComputationGraphBuilder::identity(tensor_guid_t const &input, std::optional const &name) { return this->element_unary(Op::IDENTITY, input, name); } -Tensor ComputationGraphBuilder::gelu(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::gelu(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::GELU, input, name); } -Tensor - ComputationGraphBuilder::sigmoid(Tensor const &input, +tensor_guid_t + ComputationGraphBuilder::sigmoid(tensor_guid_t const &input, std::optional const &name) { return this->element_unary(Op::SIGMOID, input, name); } -Tensor ComputationGraphBuilder::tanh(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::tanh(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::TANH, input, name); } -Tensor ComputationGraphBuilder::elu(Tensor const &input, - std::optional const &name) { +tensor_guid_t + ComputationGraphBuilder::elu(tensor_guid_t const &input, + std::optional const &name) { return this->element_unary(Op::ELU, input, name); } -Tensor ComputationGraphBuilder::conv2d( - Tensor const &x, +tensor_guid_t ComputationGraphBuilder::conv2d( + tensor_guid_t const &x, int outChannels, int kernelH, int kernelW, @@ -281,7 +345,8 @@ Tensor ComputationGraphBuilder::conv2d( use_bias}; std::string name = maybe_name.value_or(get_default_name(attrs)); - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; TensorShape output_shape = get_output_shape(attrs, input); @@ -297,8 +362,8 @@ Tensor ComputationGraphBuilder::conv2d( return this->add_layer(layer, {input}, weights, output_shape); } -Tensor ComputationGraphBuilder::dropout( - Tensor const &x, +tensor_guid_t 
ComputationGraphBuilder::dropout( + tensor_guid_t const &x, float rate, unsigned long long seed, std::optional const &maybe_name) { @@ -306,15 +371,16 @@ Tensor ComputationGraphBuilder::dropout( std::string name = maybe_name.value_or(get_default_name(attrs)); Layer layer = {attrs, name}; - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); TensorShape output_shape = get_output_shape(attrs, input); return this->add_layer(layer, {input}, {}, output_shape); } -Tensor ComputationGraphBuilder::embedding( - Tensor const &x, +tensor_guid_t ComputationGraphBuilder::embedding( + tensor_guid_t const &x, int num_entries, int outDim, AggregateOp aggr, @@ -325,7 +391,8 @@ Tensor ComputationGraphBuilder::embedding( std::string name = maybe_name.value_or(get_default_name(attrs)); Layer layer = {attrs, name}; - Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + tensor_guid_t input = + this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); TensorShape output_shape = get_output_shape(attrs, input); TensorShape weights_shape = get_weights_shape(attrs, input); @@ -334,9 +401,9 @@ Tensor ComputationGraphBuilder::embedding( layer, {input}, {{weights_shape, kernel_initializer}}, output_shape); } -std::vector ComputationGraphBuilder::gather( - Tensor const &input, - Tensor const &index, +std::vector ComputationGraphBuilder::gather( + tensor_guid_t const &input, + tensor_guid_t const &index, ff_dim_t dim, std::optional const &maybe_name) { GatherAttrs attrs = {dim}; @@ -357,19 +424,30 @@ std::vector ComputationGraphBuilder::gather( return this->add_layer(layer, {input}, {}, output_shapes); } -TensorShape get_shape(Tensor const &t) { - return t.get_shape(); +tensor_guid_t + ComputationGraphBuilder::input(Tensor const &input_tensor, + std::optional const &name) { + InputAttrs input_attrs = {}; + std::string name = 
name.value_or(get_default_name(input_attrs)); + + Layer layer = {attrs, name}; + + return this->add_layer(layer, {}, {}, input_tensor); +} + +TensorShape get_shape(tensor_guid_t const &t) { + return this->computation_graph.at(t).get_shape(); } -std::vector get_shape(std::vector const &) { +std::vector get_shape(std::vector const &) { NOT_IMPLEMENTED(); } -// Tensor ComputationGraphBuilder::aggregate( -// Tensor const &gate_preds, -// Tensor const &gate_assign, -// Tensor const &true_gate_assign, -// Tensor const &full_gate_gradients, -// std::vector const &exp_preds, +// tensor_guid_t ComputationGraphBuilder::aggregate( +// tensor_guid_t const &gate_preds, +// tensor_guid_t const &gate_assign, +// tensor_guid_t const &true_gate_assign, +// tensor_guid_t const &full_gate_gradients, +// std::vector const &exp_preds, // int n, // float lambda_bal, // std::optional const &maybe_name) { @@ -384,14 +462,14 @@ std::vector get_shape(std::vector const &) { // get_shape(full_gate_gradients), // get_shape(exp_preds)); -// std::vector inputs = { +// std::vector inputs = { // gate_preds, gate_assign, true_gate_assign, full_gate_gradients}; // extend(inputs, exp_preds); // return this->add_layer(layer, inputs, {}, output_shape); // } -Tensor ComputationGraphBuilder::batch_norm( - Tensor const &input, +tensor_guid_t ComputationGraphBuilder::batch_norm( + tensor_guid_t const &input, bool relu, std::optional const &maybe_name) { BatchNormAttrs attrs = BatchNormAttrs{relu}; diff --git a/lib/utils/include/utils/strong_typedef.h b/lib/utils/include/utils/strong_typedef.h index f700a20c79..63c41e0e5e 100644 --- a/lib/utils/include/utils/strong_typedef.h +++ b/lib/utils/include/utils/strong_typedef.h @@ -65,6 +65,10 @@ class strong_typedef { return value_; } + T &value() noexcept { + return value_; + } + template strong_typedef fmap(F const &f) { static_assert( From 9a59f34fded4af8f2061c1e95d4ee4679340404e Mon Sep 17 00:00:00 2001 From: reyna-abhyankar Date: Tue, 14 May 2024 06:47:15 
-0700 Subject: [PATCH 03/24] Shift ops and remove legion names --- .../src/ops/attention.cc | 110 +++++---- .../src/ops/attention.h | 2 +- .../src/ops/batch_matmul.cc | 36 +-- .../src/ops/batch_matmul.h | 4 +- .../src/ops/batch_norm.cc | 69 ++---- .../src/ops/batch_norm.h | 2 +- .../src/ops/cast.cc | 37 +--- .../src/ops/cast.h | 2 +- .../src/ops/combine.cc | 34 +-- .../src/ops/combine.h | 2 +- .../src/ops/concat.cc | 41 +--- .../src/ops/concat.h | 2 +- .../src/ops/conv_2d.cc | 76 ++----- .../src/ops/conv_2d.h | 2 +- .../src/ops/dropout.cc | 55 ++--- .../src/ops/dropout.h | 2 +- .../src/ops/element_binary.cc | 64 ++---- .../src/ops/element_binary.h | 0 .../src/ops/element_unary.cc | 61 ++--- .../src/ops/element_unary.h | 3 - .../src/ops/embedding.cc | 38 +--- .../src/ops/embedding.h | 2 +- .../src/ops/flat.cc | 32 +-- .../src/ops/flat.h | 0 .../src/ops/layer_norm.cc | 135 +++++------- .../src/ops/layer_norm.h | 0 .../src/ops/linear.cc | 208 ++++++++---------- .../src/ops/linear.h | 0 .../src/ops/noop.cc | 2 +- .../src/ops/noop.h | 2 +- .../src/ops/parallel_op.h | 2 +- .../src/ops/partition.cc | 119 ++++------ .../src/ops/pool_2d.cc | 142 +++++------- .../src/ops/pool_2d.h | 2 +- .../src/ops/reduce.cc | 108 ++++----- .../src/ops/reduce.h | 2 +- .../src/ops/reduction.cc | 95 +++----- .../src/ops/reduction.h | 4 +- .../src/ops/repartition.h | 2 +- .../src/ops/replicate.cc | 75 +++---- .../src/ops/replicate.h | 2 +- .../src/ops/reshape.cc | 80 ++----- .../src/ops/reshape.h | 2 +- .../src/ops/reverse.cc | 80 +++---- .../src/ops/reverse.h | 2 +- .../src/ops/softmax.cc | 96 +++----- .../src/ops/softmax.h | 2 +- .../src/ops/split.cc | 110 ++++----- .../src/ops/split.h | 4 +- .../src/ops/topk.cc | 136 +++++------- .../src/ops/topk.h | 2 +- .../src/ops/transpose.cc | 114 ++++------ .../src/ops/transpose.h | 2 +- 53 files changed, 782 insertions(+), 1424 deletions(-) rename lib/{runtime => local-execution}/src/ops/attention.cc (84%) rename lib/{runtime => 
local-execution}/src/ops/attention.h (96%) rename lib/{runtime => local-execution}/src/ops/batch_matmul.cc (88%) rename lib/{runtime => local-execution}/src/ops/batch_matmul.h (90%) rename lib/{runtime => local-execution}/src/ops/batch_norm.cc (81%) rename lib/{runtime => local-execution}/src/ops/batch_norm.h (95%) rename lib/{runtime => local-execution}/src/ops/cast.cc (80%) rename lib/{runtime => local-execution}/src/ops/cast.h (97%) rename lib/{runtime => local-execution}/src/ops/combine.cc (77%) rename lib/{runtime => local-execution}/src/ops/combine.h (94%) rename lib/{runtime => local-execution}/src/ops/concat.cc (79%) rename lib/{runtime => local-execution}/src/ops/concat.h (95%) rename lib/{runtime => local-execution}/src/ops/conv_2d.cc (78%) rename lib/{runtime => local-execution}/src/ops/conv_2d.h (95%) rename lib/{runtime => local-execution}/src/ops/dropout.cc (77%) rename lib/{runtime => local-execution}/src/ops/dropout.h (95%) rename lib/{runtime => local-execution}/src/ops/element_binary.cc (81%) rename lib/{runtime => local-execution}/src/ops/element_binary.h (100%) rename lib/{runtime => local-execution}/src/ops/element_unary.cc (77%) rename lib/{runtime => local-execution}/src/ops/element_unary.h (91%) rename lib/{runtime => local-execution}/src/ops/embedding.cc (82%) rename lib/{runtime => local-execution}/src/ops/embedding.h (94%) rename lib/{runtime => local-execution}/src/ops/flat.cc (78%) rename lib/{runtime => local-execution}/src/ops/flat.h (100%) rename lib/{runtime => local-execution}/src/ops/layer_norm.cc (62%) rename lib/{runtime => local-execution}/src/ops/layer_norm.h (100%) rename lib/{runtime => local-execution}/src/ops/linear.cc (55%) rename lib/{runtime => local-execution}/src/ops/linear.h (100%) rename lib/{runtime => local-execution}/src/ops/noop.cc (96%) rename lib/{runtime => local-execution}/src/ops/noop.h (89%) rename lib/{runtime => local-execution}/src/ops/parallel_op.h (96%) rename lib/{runtime => 
local-execution}/src/ops/partition.cc (59%) rename lib/{runtime => local-execution}/src/ops/pool_2d.cc (58%) rename lib/{runtime => local-execution}/src/ops/pool_2d.h (97%) rename lib/{runtime => local-execution}/src/ops/reduce.cc (60%) rename lib/{runtime => local-execution}/src/ops/reduce.h (98%) rename lib/{runtime => local-execution}/src/ops/reduction.cc (58%) rename lib/{runtime => local-execution}/src/ops/reduction.h (96%) rename lib/{runtime => local-execution}/src/ops/repartition.h (98%) rename lib/{runtime => local-execution}/src/ops/replicate.cc (65%) rename lib/{runtime => local-execution}/src/ops/replicate.h (95%) rename lib/{runtime => local-execution}/src/ops/reshape.cc (68%) rename lib/{runtime => local-execution}/src/ops/reshape.h (98%) rename lib/{runtime => local-execution}/src/ops/reverse.cc (69%) rename lib/{runtime => local-execution}/src/ops/reverse.h (95%) rename lib/{runtime => local-execution}/src/ops/softmax.cc (67%) rename lib/{runtime => local-execution}/src/ops/softmax.h (98%) rename lib/{runtime => local-execution}/src/ops/split.cc (67%) rename lib/{runtime => local-execution}/src/ops/split.h (95%) rename lib/{runtime => local-execution}/src/ops/topk.cc (59%) rename lib/{runtime => local-execution}/src/ops/topk.h (98%) rename lib/{runtime => local-execution}/src/ops/transpose.cc (55%) rename lib/{runtime => local-execution}/src/ops/transpose.h (98%) diff --git a/lib/runtime/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc similarity index 84% rename from lib/runtime/src/ops/attention.cc rename to lib/local-execution/src/ops/attention.cc index 41905f9014..854213a955 100644 --- a/lib/runtime/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -15,19 +15,12 @@ #include "attention.h" #include "kernels/attention_kernels.h" -#include "legion.h" -#include "op-attrs/ops/attention.h" -#include "task_spec/op_task_signature.h" +#include "op_task_signature.h" namespace FlexFlow { using namespace 
FlexFlow::Kernels::MultiHeadAttention; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { QUERY_PARALLEL_TENSOR_SHAPE, KEY_PARALLEL_TENSOR_SHAPE, @@ -86,6 +79,12 @@ OpTaskInvocation backward(MultiHeadAttentionAttrs const &attrs) { return {ATTENTION_BWD_TASK_ID, b}; } +// OpArgBacking +// generate_op_arg_backing(std::vector +// tensor_shape_args) { + +// } + static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); @@ -122,35 +121,42 @@ static DeviceSpecific int num_samples = get_piece_shape(query_parallel_tensor_shape)[ff_dim_t(2)]; int num_heads = get_piece_shape(weight_parallel_tensor_shape)[ff_dim_t(1)]; + // MHAPerDeviceState per_device_state = + // init_kernel(handle, + // allocator, + // num_samples, + // num_heads, + // qSize, + // kSize, + // vSize, + // qProjSize, + // kProjSize, + // vProjSize, + // oProjSize, + // qoSeqLength, + // kvSeqLength, + // attrs.add_bias_kv); + // return acc.create_device_specific(per_device_state); + DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - attrs.add_bias_kv)); + init_kernel(handle, + allocator, + num_samples, + num_heads, + qSize, + kSize, + vSize, + qProjSize, + kProjSize, + vProjSize, + oProjSize, + qoSeqLength, + kvSeqLength, + attrs.add_bias_kv); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto query = acc.get_tensor(QUERY); auto key = acc.get_tensor(KEY); auto value = 
acc.get_tensor(VALUE); @@ -171,15 +177,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto query = acc.get_tensor(QUERY); auto key = acc.get_tensor(KEY); auto value = acc.get_tensor(VALUE); @@ -221,14 +220,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, MultiHeadAttentionAttrs const &attrs, InputParallelTensorDesc const &query_shape, @@ -286,7 +277,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(QUERY_PARALLEL_TENSOR_SHAPE); init.add_arg_slot(KEY_PARALLEL_TENSOR_SHAPE); init.add_arg_slot(VALUE_PARALLEL_TENSOR_SHAPE); @@ -307,12 +299,18 @@ void register_task() { register_task(ATTENTION_INIT_TASK_ID, "Attention Init", init_signature(), - init_task); + init_task_impl); +} + +template <> +OpTaskSignature get_signature() { + return init_signature(); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(QUERY); fwd.add_input_slot(KEY); @@ -331,13 +329,13 @@ void register_task() { register_task(ATTENTION_FWD_TASK_ID, "Attention Fwd", fwd_signature(), - forward_task); + forward_task_impl); 
} template <> OpTaskSignature bwd_signature() { OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(ATTENTION_FWD_TASK_ID)); + infer_bwd_signature(fwd_signature()); return bwd; } @@ -347,7 +345,7 @@ void register_task() { register_task(ATTENTION_BWD_TASK_ID, "Attention Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/attention.h b/lib/local-execution/src/ops/attention.h similarity index 96% rename from lib/runtime/src/ops/attention.h rename to lib/local-execution/src/ops/attention.h index 09a4ef036f..601d8a4796 100644 --- a/lib/runtime/src/ops/attention.h +++ b/lib/local-execution/src/ops/attention.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_ATTENTION_H #include "op-attrs/ops/attention.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc similarity index 88% rename from lib/runtime/src/ops/batch_matmul.cc rename to lib/local-execution/src/ops/batch_matmul.cc index 5f40def699..c5df564afd 100644 --- a/lib/runtime/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -15,20 +15,14 @@ #include "batch_matmul.h" #include "kernels/batch_matmul_kernels.h" -#include "legion.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" -#include "task_spec/op_task_signature.h" +#include "op_task_signature.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchMatmul; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { A_INPUT, // tensor B_INPUT, // tensor @@ -60,7 +54,7 @@ OpTaskInvocation backward(BatchMatmulAttrs const &attrs) { return {BATCHMATMUL_BWD_TASK_ID, bwd}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto 
a_input = acc.get_tensor(A_INPUT); auto b_input = acc.get_tensor(B_INPUT); auto output = acc.get_tensor(OUTPUT); @@ -105,15 +99,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { iter_config.seq_length); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { // BatchMatmul* bmm = (BatchMatmul*) task->args; FFIterationConfig iter_config = acc.get_argument(ITERATION_CONFIG); @@ -165,14 +152,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { batch); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, BatchMatmulAttrs const &attrs, InputParallelTensorDesc const &a_input, @@ -208,7 +187,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(A_INPUT); fwd.add_input_slot(B_INPUT); @@ -225,7 +205,7 @@ void register_task() { register_task(BATCHMATMUL_FWD_TASK_ID, "BatchMatmul Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -241,7 +221,7 @@ void register_task() { register_task(BATCHMATMUL_BWD_TASK_ID, "BatchMatmul Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/batch_matmul.h b/lib/local-execution/src/ops/batch_matmul.h similarity index 90% rename from lib/runtime/src/ops/batch_matmul.h rename to lib/local-execution/src/ops/batch_matmul.h index 7d3f2308da..6791b11a8c 
100644 --- a/lib/runtime/src/ops/batch_matmul.h +++ b/lib/local-execution/src/ops/batch_matmul.h @@ -2,9 +2,9 @@ #define _FLEXFLOW_BATCH_MATMUL_H #include "op-attrs/ops/batch_matmul.h" +#include "op_task_invocation.h" +#include "op_task_signature.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc similarity index 81% rename from lib/runtime/src/ops/batch_norm.cc rename to lib/local-execution/src/ops/batch_norm.cc index a52981a8a3..dadfab14e0 100644 --- a/lib/runtime/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -15,17 +15,11 @@ #include "batch_norm.h" #include "kernels/batch_norm_kernels.h" -#include "legion/legion_utilities.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchNorm; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { INPUT, // tensor SCALE, // tensor @@ -88,29 +82,19 @@ static DeviceSpecific float *runningMean; DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - runningMean, - output_n, - output_c, - output_h, - output_w, - attrs.relu)); + init_kernel(handle, + allocator, + runningMean, + output_n, + output_c, + output_h, + output_w, + attrs.relu); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -123,22 +107,15 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) 
{ return profile(forward_kernel, profiling, "[BatchNorm] forward_time = %.2lfms\n", - &per_device_state, + per_device_state, input.get_float_ptr(), output.get_float_ptr(), scale.get_float_ptr(), bias.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -154,7 +131,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, "[BatchNorm] backward_time = %.2lfms\n", - &per_device_state, + per_device_state, input.get_float_ptr(), output_grad.get_float_ptr(), output.get_float_ptr(), @@ -165,14 +142,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output.shape.get_volume()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, BatchNormAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -220,7 +189,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); init.add_input_slot(BIAS); init.add_output_slot(OUTPUT); @@ -236,12 +206,13 @@ void register_task() { register_task(BATCHNORM_INIT_TASK_ID, "BatchNorm Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; 
+ fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_input_slot(SCALE); @@ -258,7 +229,7 @@ void register_task() { register_task(BATCHNORM_FWD_TASK_ID, "BatchNorm Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -274,7 +245,7 @@ void register_task() { register_task(BATCHNORM_BWD_TASK_ID, "BatchNorm Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/batch_norm.h b/lib/local-execution/src/ops/batch_norm.h similarity index 95% rename from lib/runtime/src/ops/batch_norm.h rename to lib/local-execution/src/ops/batch_norm.h index 906e85a57c..6fae871c2c 100644 --- a/lib/runtime/src/ops/batch_norm.h +++ b/lib/local-execution/src/ops/batch_norm.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_BATCH_NORM_H #include "op-attrs/ops/batch_norm.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc similarity index 80% rename from lib/runtime/src/ops/cast.cc rename to lib/local-execution/src/ops/cast.cc index 44230eaf46..0914ea40a6 100644 --- a/lib/runtime/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -15,17 +15,12 @@ #include "cast.h" #include "kernels/cast_kernels.h" -#include "legion/legion_utilities.h" -#include "task_spec/op_task_signature.h" + +#include "op_task_signature.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Cast; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - namespace FlexFlow { enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -48,7 +43,7 @@ OpTaskInvocation backward(CastAttrs const &attrs) { return {CAST_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = 
acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -64,15 +59,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.dtype); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -90,14 +78,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.dtype); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, CastAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -127,7 +107,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); @@ -143,7 +124,7 @@ void register_task() { register_task(CAST_FWD_TASK_ID, "Cast Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -158,7 +139,7 @@ void register_task() { register_task(CAST_BWD_TASK_ID, "Cast Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/cast.h b/lib/local-execution/src/ops/cast.h similarity index 97% rename from lib/runtime/src/ops/cast.h rename to lib/local-execution/src/ops/cast.h index c0c500e869..ce9a93aa32 100644 --- a/lib/runtime/src/ops/cast.h +++ b/lib/local-execution/src/ops/cast.h @@ -16,8 +16,8 @@ #define _FLEXFLOW_CAST_H 
#include "op-attrs/ops/cast.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc similarity index 77% rename from lib/runtime/src/ops/combine.cc rename to lib/local-execution/src/ops/combine.cc index 46d5ebb4fe..942d964021 100644 --- a/lib/runtime/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -15,15 +15,11 @@ #include "combine.h" #include "kernels/combine_kernels.h" -#include "task_spec/op_task_invocation.h" +#include "op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::Combine; @@ -46,7 +42,7 @@ OpTaskInvocation backward(CombineAttrs const &attrs) { return {COMBINE_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -59,15 +55,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); @@ -80,14 +69,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, 
ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, CombineAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -103,7 +84,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); @@ -117,7 +99,7 @@ void register_task() { register_task(COMBINE_FWD_TASK_ID, "Combine Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -133,7 +115,7 @@ void register_task() { register_task(COMBINE_BWD_TASK_ID, "Combine Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/combine.h b/lib/local-execution/src/ops/combine.h similarity index 94% rename from lib/runtime/src/ops/combine.h rename to lib/local-execution/src/ops/combine.h index 6b3a43863b..5923e9ebcc 100644 --- a/lib/runtime/src/ops/combine.h +++ b/lib/local-execution/src/ops/combine.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_COMBINE_H #include "op-attrs/ops/combine.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc similarity index 79% rename from lib/runtime/src/ops/concat.cc rename to lib/local-execution/src/ops/concat.cc index 1ce549cc57..3d62c19f20 100644 --- a/lib/runtime/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -15,21 +15,16 @@ #include "concat.h" #include "kernels/concat_kernels.h" -#include "legion/legion_utilities.h" + #include "op-attrs/get_output_shapes.h" -#include "task_spec/op_task_signature.h" -#include "task_spec/variadic_tensor_ref.h" +#include "op_task_signature.h" #include "utils/hash-utils.h" +#include "variadic_tensor_ref.h" namespace FlexFlow { using namespace 
FlexFlow::Kernels::Concat; -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - enum Slots { INPUTS, OUTPUT, ATTRS, PROFILING, HANDLE, NUM_INPUTS }; OpTaskInvocation forward(ConcatAttrs const &attrs) { @@ -48,7 +43,7 @@ OpTaskInvocation backward(ConcatAttrs const &attrs) { return {CONCAT_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -65,15 +60,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.axis); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto const &attrs = acc.get_argument(ATTRS); @@ -90,14 +78,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.axis); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ConcatAttrs const &attrs, @@ -131,7 +111,8 @@ CostMetrics template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUTS, SlotType::VARIADIC); @@ -145,13 +126,13 @@ void register_task() { register_task(CONCAT_FWD_TASK_ID, "Concat Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> OpTaskSignature 
bwd_signature() { OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(CONCAT_FWD_TASK_ID)); + infer_bwd_signature(fwd_signature()); return bwd; } @@ -161,7 +142,7 @@ void register_task() { register_task(CONCAT_BWD_TASK_ID, "Concat Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/concat.h b/lib/local-execution/src/ops/concat.h similarity index 95% rename from lib/runtime/src/ops/concat.h rename to lib/local-execution/src/ops/concat.h index 27dec47743..d0a432e8b3 100644 --- a/lib/runtime/src/ops/concat.h +++ b/lib/local-execution/src/ops/concat.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_CONCAT_H #include "op-attrs/ops/concat.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc similarity index 78% rename from lib/runtime/src/ops/conv_2d.cc rename to lib/local-execution/src/ops/conv_2d.cc index 01d8abab55..0df15e9b23 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -1,17 +1,10 @@ #include "conv_2d.h" #include "kernels/conv_2d_kernels.h" -#include "legion/legion_utilities.h" -#include "mpark/variant.hpp" #include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Conv2D; enum Slots { @@ -70,33 +63,23 @@ static DeviceSpecific auto filter_grad = acc.get_tensor_grad(FILTER); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - attrs.activation, - attrs.kernel_h, - attrs.kernel_w, - attrs.groups, - attrs.padding_h, - attrs.padding_w, - attrs.stride_h, - attrs.stride_w, - input, - output, - filter.get_float_ptr(), - filter_grad.get_float_ptr())); + init_kernel(handle, + attrs.activation, + 
attrs.kernel_h, + attrs.kernel_w, + attrs.groups, + attrs.padding_h, + attrs.padding_w, + attrs.stride_h, + attrs.stride_w, + input, + output, + filter.get_float_ptr(), + filter_grad.get_float_ptr()); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -118,15 +101,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.activation); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -155,14 +131,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.activation); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, Conv2DAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -210,7 +178,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); init.add_output_slot(OUTPUT); 
@@ -228,12 +197,13 @@ void register_task() { register_task(CONV2D_INIT_TASK_ID, "Conv2d Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); @@ -252,7 +222,7 @@ void register_task() { register_task(CONV2D_FWD_TASK_ID, "Conv2d Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -268,7 +238,7 @@ void register_task() { register_task(CONV2D_BWD_TASK_ID, "Conv2d Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/conv_2d.h b/lib/local-execution/src/ops/conv_2d.h similarity index 95% rename from lib/runtime/src/ops/conv_2d.h rename to lib/local-execution/src/ops/conv_2d.h index 7225099a47..0e92b00553 100644 --- a/lib/runtime/src/ops/conv_2d.h +++ b/lib/local-execution/src/ops/conv_2d.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_CONV_2D_H #include "op-attrs/ops/conv_2d.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc similarity index 77% rename from lib/runtime/src/ops/dropout.cc rename to lib/local-execution/src/ops/dropout.cc index fe85afea38..236b7e2c88 100644 --- a/lib/runtime/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -1,18 +1,12 @@ #include "dropout.h" #include "kernels/dropout_kernels.h" -#include "legion/legion_utilities.h" #include "op-attrs/get_output_shapes.h" -#include "task_spec/op_task_invocation.h" -#include "task_spec/task_signature.h" +#include "op_task_invocation.h" +#include "op_task_signature.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace 
FlexFlow::Kernels::Dropout; enum Slots { INPUT, OUTPUT, ATTRS, PER_DEVICE_STATE, FF_HANDLE, PROFILING }; @@ -54,21 +48,11 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator)); + init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -83,15 +67,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -108,14 +85,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { input_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, DropoutAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -155,7 +124,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, 
template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(FF_HANDLE); init.add_output_slot(OUTPUT); @@ -170,12 +140,13 @@ void register_task() { register_task(DROPOUT_INIT_TASK_ID, "Dropout Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); @@ -191,7 +162,7 @@ void register_task() { register_task(DROPOUT_FWD_TASK_ID, "Dropout Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -207,7 +178,7 @@ void register_task() { register_task(DROPOUT_BWD_TASK_ID, "Dropout Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/dropout.h b/lib/local-execution/src/ops/dropout.h similarity index 95% rename from lib/runtime/src/ops/dropout.h rename to lib/local-execution/src/ops/dropout.h index 88a255d140..4f22842c8a 100644 --- a/lib/runtime/src/ops/dropout.h +++ b/lib/local-execution/src/ops/dropout.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_DROPOUT_H #include "op-attrs/ops/dropout.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" #include "tasks.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc similarity index 81% rename from lib/runtime/src/ops/element_binary.cc rename to lib/local-execution/src/ops/element_binary.cc index f6be2198ca..0cec2b8d0a 100644 --- a/lib/runtime/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -1,16 +1,11 @@ #include "element_binary.h" #include "kernels/element_binary_kernels.h" -#include "legion/legion_utilities.h" + #include 
"op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::ElementBinary; enum Slots { @@ -66,27 +61,17 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - attrs.type, - attrs.should_broadcast_lhs, - attrs.should_broadcast_rhs, - input_lhs.shape, - input_rhs.shape, - output.shape)); + init_kernel(handle, + attrs.type, + attrs.should_broadcast_lhs, + attrs.should_broadcast_rhs, + input_lhs.shape, + input_rhs.shape, + output.shape); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -109,15 +94,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { handle); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -146,14 +124,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { handle); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor 
acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ElementBinaryAttrs const &attrs, @@ -203,7 +173,8 @@ CostMetrics template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(LHS_INPUT); init.add_input_slot(RHS_INPUT); @@ -221,12 +192,13 @@ void register_task() { register_task(ELEMENTBINARY_INIT_TASK_ID, "ElementBinary Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); @@ -245,7 +217,7 @@ void register_task() { register_task(ELEMENTBINARY_FWD_TASK_ID, "ElementBinary Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -261,7 +233,7 @@ void register_task() { register_task(ELEMENTBINARY_BWD_TASK_ID, "ElementBinary Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/element_binary.h b/lib/local-execution/src/ops/element_binary.h similarity index 100% rename from lib/runtime/src/ops/element_binary.h rename to lib/local-execution/src/ops/element_binary.h diff --git a/lib/runtime/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc similarity index 77% rename from lib/runtime/src/ops/element_unary.cc rename to lib/local-execution/src/ops/element_unary.cc index f41a8b3551..9567fc1570 100644 --- a/lib/runtime/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -1,15 +1,11 @@ #include "element_unary.h" #include "kernels/element_unary_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::Context; -using 
Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::ElementUnary; @@ -27,7 +23,6 @@ enum Slots { OpTaskInvocation init(ElementUnaryUnifiedAttrs const &attrs) { OpTaskBinding b; - b.bind_arg(HANDLE, ff_handle()); b.bind_arg(ATTRS, attrs); b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0)); @@ -58,32 +53,21 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); ProfilingSettings profiling = acc.get_argument(PROFILING); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); ParallelTensorShape input_shape = acc.get_argument(INPUT_SHAPE); ParallelTensorShape output_shape = get_output_shape(attrs, input_shape); - DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(input_shape, output_shape, attrs)); + DeviceSpecific per_device_state = init_kernel( + get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto &handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = @@ -99,22 +83,15 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + 
backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto &handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -132,14 +109,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, ElementUnaryUnifiedAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -147,7 +116,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, MachineView const &mv) { auto env = sim.new_environment(); - ParallelTensorShape output_shape = get_output_shape(attrs, input_shape); + ParallelTensorShape output_shape = get_output_shape(attrs, input_shape.shape); SimTaskBinding init_binding; init_binding.bind_arg(HANDLE, ff_handle()); @@ -181,7 +150,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(INPUT_SHAPE); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -196,12 +166,13 @@ void register_task() { register_task(ELEMENTUNARY_INIT_TASK_ID, "ElementUnary Init", init_signature(), - init_task); + init_task_impl); } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); @@ -217,7 +188,7 @@ void register_task() { register_task(ELEMENTUNARY_FWD_TASK_ID, "ElementUnary 
Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -233,7 +204,7 @@ void register_task() { register_task(ELEMENTUNARY_BWD_TASK_ID, "ElementUnary Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/element_unary.h b/lib/local-execution/src/ops/element_unary.h similarity index 91% rename from lib/runtime/src/ops/element_unary.h rename to lib/local-execution/src/ops/element_unary.h index f44efc28db..83f6177b8d 100644 --- a/lib/runtime/src/ops/element_unary.h +++ b/lib/local-execution/src/ops/element_unary.h @@ -7,9 +7,6 @@ namespace FlexFlow { -using ElementUnaryUnifiedAttrs = - variant; - template <> void register_task(); template <> diff --git a/lib/runtime/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc similarity index 82% rename from lib/runtime/src/ops/embedding.cc rename to lib/local-execution/src/ops/embedding.cc index a1bc915d2f..31dc83814f 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -15,17 +15,11 @@ #include "embedding.h" #include "kernels/embedding_kernels.h" -#include "legion.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/embedding.h" namespace FlexFlow { -// declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - using namespace FlexFlow::Kernels::Embedding; enum Slots { INPUT, WEIGHT, OUTPUT, ATTRS, PROFILING }; @@ -49,7 +43,7 @@ OpTaskInvocation backward(EmbeddingAttrs const &attrs) { return {EMBED_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); @@ -71,15 +65,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { input.shape[legion_dim_t(1)]); } -static void 
forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); @@ -98,15 +85,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), - input.shape[ff_dim_t(0)]); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + input.shape.at(ff_dim_t(0))); } CostMetrics measure_operator_cost(SimEnvFactory const &sim, @@ -141,7 +120,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_input_slot(OUTPUT); @@ -158,7 +138,7 @@ void register_task() { register_task(EMBED_FWD_TASK_ID, "Embed Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -172,7 +152,7 @@ void register_task() { register_task(EMBED_BWD_TASK_ID, "Embed Bwd", bwd_signature(), - backward_task); + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/embedding.h b/lib/local-execution/src/ops/embedding.h similarity index 94% rename from lib/runtime/src/ops/embedding.h rename to lib/local-execution/src/ops/embedding.h index cd1b14fa66..b4caebf952 100644 --- a/lib/runtime/src/ops/embedding.h +++ b/lib/local-execution/src/ops/embedding.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_EMBEDDING_H #include "op-attrs/ops/embedding.h" +#include "op_task_invocation.h" #include "sim_environment.h" 
-#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc similarity index 78% rename from lib/runtime/src/ops/flat.cc rename to lib/local-execution/src/ops/flat.cc index f53a6185b6..45d3805e0c 100644 --- a/lib/runtime/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -5,10 +5,6 @@ namespace FlexFlow { // declare Legion names -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; using namespace FlexFlow::Kernels::Flat; @@ -30,7 +26,7 @@ OpTaskInvocation backward(FlatAttrs const &attrs) { return {FLAT_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -42,15 +38,8 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -65,14 +54,6 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim, FlatAttrs const &attrs, InputParallelTensorDesc const &input_shape, @@ -101,7 +82,8 @@ CostMetrics 
measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); @@ -115,7 +97,7 @@ void register_task() { register_task(FLAT_FWD_TASK_ID, "Flat Fwd", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -130,7 +112,7 @@ void register_task() { register_task(FLAT_BWD_TASK_ID, "Flat Bwd", bwd_signature(), - backward_task); + backward_task_impl); } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/flat.h b/lib/local-execution/src/ops/flat.h similarity index 100% rename from lib/runtime/src/ops/flat.h rename to lib/local-execution/src/ops/flat.h diff --git a/lib/runtime/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc similarity index 62% rename from lib/runtime/src/ops/layer_norm.cc rename to lib/local-execution/src/ops/layer_norm.cc index 6bc671c249..3caf95c068 100644 --- a/lib/runtime/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -15,26 +15,27 @@ #include "layer_norm.h" #include "kernels/layer_norm_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" #include -using Legion::Context; -using Legion::PhysicalRegion; -using Legion::Runtime; -using Legion::Task; - namespace FlexFlow { +using namespace FlexFlow::Kernels::LayerNorm; + enum Slots { PROFILING, INPUT, + INPUT_GRAD, OUTPUT, + OUTPUT_GRAD, GAMMA, + GAMMA_GRAD, BETA, + BETA_GRAD, PER_DEVICE_STATE, ATTRS, HANDLE @@ -59,7 +60,7 @@ OpTaskInvocation forward(LayerNormAttrs const &attrs) { b.bind(GAMMA, weight_tensor(0)); // todo, this may have some problem b.bind(BETA, weight_tensor(1)); // how to get gmmam and beta b.bind_arg(PROFILING, profiling_settings()); - 
b.bind_arg(PER_DEVICE_STATE, per_device_state()); + b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); return {LAYERNORM_FWD_TASK_ID, b}; } @@ -70,11 +71,11 @@ OpTaskInvocation backward(LayerNormAttrs const &attrs) { return {LAYERNORM_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto gamma = acc.get_tensor(GAMMA); - auto beta = acc.get_tensor(BETA); +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto gamma = acc.get_tensor(GAMMA); + auto beta = acc.get_tensor(BETA); ProfilingSettings profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); @@ -83,28 +84,21 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[LayerNorm] forward time = %.2lfms\n", state, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - beta.get_float_ptr()); + input, + output, + gamma, + beta); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto gamma = acc.get_tensor(GAMMA); -static optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto gamma = acc.get_tensor(GAMMA); - - auto input_grad = acc.get_tensor(INPUT_GRAD); - auto gamma_grad = acc.get_tensor(GAMMA_GRAD); - auto beta_grad = acc.get_tensor(BETA_GRAD); - auto output_grad = acc.get_tensor(OUTPUT_GRAD); + auto input_grad = acc.get_tensor(INPUT_GRAD); + auto gamma_grad = acc.get_tensor(GAMMA_GRAD); + auto beta_grad = acc.get_tensor(BETA_GRAD); + auto output_grad = acc.get_tensor(OUTPUT_GRAD); ProfilingSettings 
profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); @@ -113,28 +107,20 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[LayerNorm] backward time = %.2lfms\n", state, - output_grad.get_float_ptr(), - input.get_float_ptr(), - input_grad.get_float_ptr(), - gamma.get_float_ptr(), - gamma_grad.get_float_ptr(), - beta_grad.get_float_ptr()); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + output_grad, + input, + input_grad, + gamma, + gamma_grad, + beta_grad); } static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); + auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); - auto input = acc.get_tensor(INPUT); - FFHandler handle = acc.get_argument(HANDLE); + auto input = acc.get_tensor(INPUT); + auto handle = acc.get_argument(HANDLE); // question: how to get batch_size and effective_num_elements int64_t effective_batch_size, effective_num_elements; @@ -143,48 +129,39 @@ static DeviceSpecific M *= input.shape.at(legion_dim_t(attrs.axes[i])); } int num_replicas = 1; - for (int i = 0; i < intput.shape.num_dims(); i++) { + for (int i = 0; i < input.shape.num_dims(); i++) { num_replicas *= input.shape.at(legion_dim_t(i)); effective_num_elements = M; effective_batch_size = input.shape.get_volume() / M; DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, - allocator, - attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, - attrs.eps)); + init_kernel(handle, + allocator, + attrs.elementwise_affine, + effective_batch_size, + effective_num_elements, + attrs.eps); } } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor 
acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, LayerNormAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; init_binding.bind_arg(HANDLE, ff_handle()); init_binding.bind_arg(ATTRS, attrs); - init.binding.bind(INPUT, input.shape); + init_binding.bind(INPUT, input.shape); auto init_accessor = env.get_init_accessor(LAYERNORM_INIT_TASK_ID, init_binding); - DeviceSpecific = init_task_impl(init_accessor); + DeviceSpecific per_device_state = + init_task_impl(init_accessor); SimTaskBinding fwd_binding; fwd_binding.bind(INPUT, input.shape); @@ -193,8 +170,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); // TODO how to handle gamma and beta, where are they from - fwd_binding.bind(GAMMA, input_shape); - fwd_binding.bind(BETA, input_shape); + fwd_binding.bind(GAMMA, input.shape); + fwd_binding.bind(BETA, input.shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); auto fwd_accessor = env.get_fwd_accessor(LAYERNORM_FWD_TASK_ID, fwd_binding); @@ -209,7 +186,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); @@ -222,7 +200,7 @@ OpTaskSignature fwd_signature() { } template <> -OpTaskSignature bwd_signature() { +OpTaskSignature bwd_signature() { OpTaskSignature bwd = infer_bwd_signature(fwd_signature()); return bwd; @@ -230,7 +208,8 @@ OpTaskSignature bwd_signature() { template <> OpTaskSignature init_signature() { - OpTaskSignature 
init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -245,7 +224,7 @@ void register_task() { register_task(LAYERNORM_INIT_TASK_ID, "LayerNorm init", init_signature(), - init_task); + init_task_impl); } template <> @@ -253,15 +232,15 @@ void register_task() { register_task(LAYERNORM_FWD_TASK_ID, "LayerNorm forward", fwd_signature(), - forward_task); + forward_task_impl); } template <> void register_task() { register_task(LAYERNORM_BWD_TASK_ID, "LayerNorm backward", - bwd_signature(), - backward_task); + bwd_signature(), + backward_task_impl); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/layer_norm.h b/lib/local-execution/src/ops/layer_norm.h similarity index 100% rename from lib/runtime/src/ops/layer_norm.h rename to lib/local-execution/src/ops/layer_norm.h diff --git a/lib/runtime/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc similarity index 55% rename from lib/runtime/src/ops/linear.cc rename to lib/local-execution/src/ops/linear.cc index 96d037913c..2d13909c09 100644 --- a/lib/runtime/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -1,32 +1,14 @@ #include "linear.h" #include "kernels/linear_kernels.h" -#include "layer.h" -#include "legion/legion_utilities.h" #include "op-attrs/ff_dim.h" #include "op-attrs/get_output_shapes.h" -#include "utils/exceptions.h" +#include "task_argument_accessor.h" +#include "utils/exception.h" #include "utils/graph/views.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using 
Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Linear; enum slots { @@ -43,12 +25,12 @@ enum slots { OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(HANDLE, ff_handle()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(ATTRS, attrs); - bind.bind(INPUT, input_tensor(0)); // input - bind.bind(WEIGHT, weight_tensor(0)); // weight - bind.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); // input + binding.bind(WEIGHT, weight_tensor(0)); // weight + binding.bind(OUTPUT, output_tensor(0)); // output return {LINEAR_INIT_TASK_ID, binding}; } @@ -56,14 +38,17 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - bind.bind(INPUT, input_tensor(0)); // input - bind.bind(WEIGHT, weight_tensor(0)); // weight - bind.bind(OUTPUT, output_tensor(0)); // output - bind.bind(BIAS, bias_tensor(0)); // bias + binding.bind(INPUT, input_tensor(0)); // input + binding.bind(WEIGHT, weight_tensor(0)); // weight + binding.bind(OUTPUT, output_tensor(0)); // output + if (attrs.use_bias) { + binding.bind(BIAS, weight_tensor(1)); // bias + } - bing.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(PER_DEVICE_STATE, per_device_state()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + binding.bind_arg(ATTRS, attrs); return {LINEAR_FWD_TASK_ID, binding}; } @@ -74,51 +59,38 @@ OpTaskInvocation backward(LinearAttrs const &attrs) { return {LINEAR_BWD_TASK_ID, b}; } -static DeviceSpecific - init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - Allocator allocator = acc.get_allocator(); +static LinearPerDeviceState init_task_impl(TaskArgumentAccessor const &acc) { + auto const &attrs = acc.get_argument(ATTRS); PerDeviceFFHandle handle = 
acc.get_argument(HANDLE); auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); int out_dim = output.shape.at(ff_dim_t{0}); - int batch_size = output.shape.at.(ff_dim_t{1}); + int batch_size = output.shape.at(ff_dim_t{1}); float *one_ptr; - DeviceSpecific state = - acc.create_device_specific( - init_kernel(handle, - allocator, - one_ptr, - attrs.regularizer, - attrs.use_bias, - input.data_type, - weight.data_type, - output.data_type, - batch_size, - attrs.out_channels)); + LinearPerDeviceState state = init_kernel(handle, + one_ptr, + attrs.regularizer, + attrs.use_bias, + input.data_type, + weight.data_type, + output.data_type, + batch_size, + attrs.out_channels); return state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); auto bias = acc.get_tensor(BIAS); - auto state = acc.get_device_specific(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); @@ -144,15 +116,10 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { batch_size); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -}; +; -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto 
output = acc.get_tensor(OUTPUT); @@ -161,7 +128,8 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); auto output_grad = acc.get_tensor_grad(OUTPUT); - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); @@ -178,63 +146,61 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Linear] backward_time = %.2lfms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), - output.get_float_ptr(), - output_grad.get_float_ptr(), - weight.get_float_ptr(), - weight_grad.get_float_ptr(), - bias_ptr, + (void *)input.get_float_ptr(), + (void *)input_grad.get_float_ptr(), + (void *)output.get_float_ptr(), + (void *)output_grad.get_float_ptr(), + (void *)weight.get_float_ptr(), + (void *)weight_grad.get_float_ptr(), + (void *)bias_ptr, in_dim, out_dim, batch_size); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, LinearAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shape(input.shape, attrs); + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + ParallelTensorShape weight_shape = get_weights_shape(attrs, input.shape); + ParallelTensorShape bias_shape = get_bias_shape(attrs, input.shape); SimTaskBinding init_binding; - init_binding.bind(INPUT, input_tensor(0)); - init_binding.bind(WEIGHT, weight_tensor(0)); - 
init_binding.bind(BIAS, bias_tensor(0)); - init_binding.bind(OUTPUT, output_tensor(0)); + init_binding.bind(INPUT, input.shape); + init_binding.bind(WEIGHT, weight_shape); + if (attrs.use_bias) { + init_binding.bind(BIAS, bias_shape); + } + init_binding.bind(OUTPUT, output_shape); init_binding.bind_arg(ATTRS, attrs); init_binding.bind_arg(HANDLE, ff_handle()); auto init_accessor = env.get_init_accessor(LINEAR_INIT_TASK_ID, init_binding); - DeviceSpecific per_device_state = - init_task_impl(init_accessor); + LinearPerDeviceState per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; - fwd_bind.bind(INPUT, input_tensor(0)); // input - fwd_bind.bind(WEIGHT, weight_tensor(0)); // weight - fwd_bind.bind(OUTPUT, output_tensor(0)); // output - fwd_bind.bind(BIAS, bias_tensor(0)); // bias + fwd_binding.bind(INPUT, input.shape); // input + fwd_binding.bind(WEIGHT, weight_shape); // weight + fwd_binding.bind(OUTPUT, output_shape); // output + if (attrs.use_bias) { + fwd_binding.bind(BIAS, bias_shape); // bias + } - fwd_bid.bind_arg(PROFILING, profiling_settings()); - fwd_bind.bind_arg(PER_DEVICE_STATE, per_device_state()); - fwd_bind.bind_arg(ATTRS, attrs); + fwd_binding.bind_arg(PROFILING, profiling_settings()); + fwd_binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + fwd_binding.bind_arg(ATTRS, attrs); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_accessor(LINEAR_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = env.get_accessor(LINEAR_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = env.get_fwd_accessor(LINEAR_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(LINEAR_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -245,27 +211,28 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> OpTaskSignature init_signature() { - OpTaskSignature 
init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); - init.add_input_slot(WEIGHT); - init.add_input_slot(BIAS); + init.add_weight_slot(WEIGHT); init.add_output_slot(OUTPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); - return init, + return init; } template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); - fwd.add_input_slot(WEIGHT); - fwd.add_input_slot(BIAS); + fwd.add_weight_slot(WEIGHT); + fwd.add_optional_weight_slot(BIAS); fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); @@ -281,13 +248,28 @@ OpTaskSignature bwd_signature() { return bwd; } +template <> +TaskImplFunction get_task_impl() { + return init_task_impl; +} + +template <> +TaskImplFunction get_task_impl() { + return forward_task_impl; +} + +template <> +TaskImplFunction get_task_impl() { + return backward_task_impl; +} + template <> void register_task() { register_task(LINEAR_INIT_TASK_ID, "Linear::init_task", init_signature(), - init_task); + init_task_impl); } template <> @@ -295,7 +277,7 @@ void register_task() { register_task(LINEAR_FWD_TASK_ID, "Linear::fwd_task", fwd_signature(), - forward_task); + forward_task_impl); } template <> @@ -303,7 +285,11 @@ void register_task() { register_task(LINEAR_BWD_TASK_ID, "Linear::bwd_task", bwd_signature(), - backward_task); + backward_task_impl); +} + +std::vector get_task_ids(LinearAttrs const &) { + return {LINEAR_INIT_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID}; } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/linear.h b/lib/local-execution/src/ops/linear.h similarity index 100% rename from lib/runtime/src/ops/linear.h rename to lib/local-execution/src/ops/linear.h diff --git a/lib/runtime/src/ops/noop.cc b/lib/local-execution/src/ops/noop.cc similarity index 96% rename from lib/runtime/src/ops/noop.cc rename to 
lib/local-execution/src/ops/noop.cc index 6b8510607a..02ffeaf111 100644 --- a/lib/runtime/src/ops/noop.cc +++ b/lib/local-execution/src/ops/noop.cc @@ -14,7 +14,7 @@ */ #include "noop.h" -#include "task_spec/op_task_invocation.h" +#include "op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/noop.h b/lib/local-execution/src/ops/noop.h similarity index 89% rename from lib/runtime/src/ops/noop.h rename to lib/local-execution/src/ops/noop.h index f5cf6cc98c..17a9426e77 100644 --- a/lib/runtime/src/ops/noop.h +++ b/lib/local-execution/src/ops/noop.h @@ -3,7 +3,7 @@ #include "op-attrs/ops/input.h" #include "op-attrs/ops/noop.h" -#include "task_spec/op_task_invocation.h" +#include "op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/parallel_op.h b/lib/local-execution/src/ops/parallel_op.h similarity index 96% rename from lib/runtime/src/ops/parallel_op.h rename to lib/local-execution/src/ops/parallel_op.h index 6b596a4fb5..e7bd98b8a8 100644 --- a/lib/runtime/src/ops/parallel_op.h +++ b/lib/local-execution/src/ops/parallel_op.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ParallelOpJoinResult { - optional op = nullopt; + std::optional op = std::nullopt; bool join_did_succeed = false; }; diff --git a/lib/runtime/src/ops/partition.cc b/lib/local-execution/src/ops/partition.cc similarity index 59% rename from lib/runtime/src/ops/partition.cc rename to lib/local-execution/src/ops/partition.cc index 2a974e96da..c6e5bce64d 100644 --- a/lib/runtime/src/ops/partition.cc +++ b/lib/local-execution/src/ops/partition.cc @@ -13,32 +13,13 @@ * limitations under the License. 
*/ -#include "parallel_ops/partition.h" #include "kernels/partition_kernels.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" +#include "repartition.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Repartition; @@ -59,7 +40,7 @@ OpTaskInvocation forward(RepartitionAttrs const &attrs) { binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(ATTRS, attrs); binding.bind_arg(PER_DEVICE_STATE, - per_device_state()); + per_device_op_state()); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -79,64 +60,39 @@ static DeviceSpecific // Note: use the input data type DeviceSpecific per_device_state = - acc.create_device_specific_state( - init_kernel(handle, input.data_type)); + init_kernel(handle, input.data_type); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - return profiling(forward, 
- profiling, - "[Reparition/Partition] forward_time = %.2lfms\n", - per_device_state, - input, - output); -} - -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); + return profile(forward_kernel, + profiling, + "[Reparition/Partition] forward_time = %.2lfms\n", + per_device_state, + input, + output); } -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward, - profiling, - "[Reparition/Partition] backward_time = %.2lfms\n", - per_device_state, - input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + return profile(backward_kernel, + profiling, + "[Reparition/Partition] backward_time = %.2lfms\n", + per_device_state, + output_grad, + input_grad); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -144,7 +100,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); @@ -165,8 +121,10 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto fwd_accessor = env.get_accessor(REPARTITION_FWD_TASK_ID, fwd_binding); - auto bwd_accessor = 
env.get_accessor(REPARTITION_BWD_TASK_ID, bwd_binding); + auto fwd_accessor = + env.get_fwd_accessor(REPARTITION_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = + env.get_bwd_accessor(REPARTITION_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -177,7 +135,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_unchecked_arg_slot(HANDLE); @@ -185,27 +144,33 @@ void register_task() { init.add_return_value(); - register_task(REPARTITION_INIT_TASK_ID, "Repartition Init", init, init_task); + register_task( + REPARTITION_INIT_TASK_ID, "Repartition Init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - register_task(REPARTITION_FWD_TASK_ID, "Repartition Fwd", fwd, forward_task); + register_task( + REPARTITION_FWD_TASK_ID, "Repartition Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REPARTITION_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REPARTITION_BWD_TASK_ID, "Repartition Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REPARTITION_FWD_TASK_ID)); + +// register_task(REPARTITION_BWD_TASK_ID, "Repartition Bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc similarity index 58% rename from lib/runtime/src/ops/pool_2d.cc rename to lib/local-execution/src/ops/pool_2d.cc index 
577837c960..32bc5d1616 100644 --- a/lib/runtime/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -1,10 +1,10 @@ #include "pool_2d.h" #include "kernels/pool_2d_kernels.h" -#include "legion/legion_utilities.h" + #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/pool_2d.h" #include "utils/exception.decl.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Pool2D; @@ -23,13 +23,13 @@ OpTaskInvocation init(Pool2DAttrs const &attrs) { return {POOL2D_INIT_TASK_ID, binding}; } -static DeviceSpecific +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); int input_w = input.shape.at(ff_dim_t(0)) + 1; int input_h = input.shape.at(ff_dim_t(1)) + 1; @@ -64,37 +64,27 @@ static DeviceSpecific printf("Warning: changing pool_padding_w to satisfy output_w size\n"); } - DeviceSpecific state = acc.create_device_specific( - init_kernel(handle, - attrs.activation, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - attrs.kernel_h, - attrs.kernel_w, - attrs.stride_h, - attrs.stride_w, - attrs.pool_type); + DeviceSpecific state = init_kernel(handle, + attrs.activation, + input_w, + input_h, + input_c, + input_n, + output_w, + output_h, + output_c, + output_n, + pad_h, + pad_w, + attrs.kernel_h, + attrs.kernel_w, + attrs.stride_h, + attrs.stride_w, + attrs.pool_type); return state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - OpTaskInvocation forward(Pool2DAttrs const &attrs) { OpTaskBinding 
binding; binding.bind(INPUT, input_tensor(0)); @@ -102,53 +92,46 @@ OpTaskInvocation forward(Pool2DAttrs const &attrs) { binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state()); return {POOL2D_FWD_TASK_ID, binding}; } -OpTaskInvocation backward(Pool2DAttrs const &) { +OpTaskInvocation backward(Pool2DAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); return {POOL2D_BWD_TASK_ID, b}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - Pool2dPerDeviceState state = - acc.get_argument(PER_DEVICE_STATE); + Pool2DPerDeviceState state = + acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, - profilng, + profiling, "[Pool2D] forward_time = %.2lfms\n", state, input.get_float_ptr(), output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - Pool2dPerDeviceState state = - acc.get_argument(PER_DEVICE_STATE); + Pool2DPerDeviceState state = + acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto output_grad = 
acc.get_tensor(OUTPUT); return profile(backward_kernel, - profilng, + profiling, "[Pool2D] backward_time = %.2lfms\n", state, input.get_float_ptr(), @@ -157,20 +140,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, Pool2DAttrs const &attrs, - ParallelTensorShape const &input, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; @@ -181,21 +156,21 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, auto init_accessor = env.get_init_accessor(POOL2D_INIT_TASK_ID, init_binding); - DeviceSpecific per_device_state = + DeviceSpecific per_device_state = init_task_impl(init_accessor); SimTaskBinding fwd_binding; - fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); - auto fwd_accessor = env.get_accessor(POOL2D_FWD_TASK_ID, fwd_binding); + auto fwd_accessor = env.get_fwd_accessor(POOL2D_FWD_TASK_ID, fwd_binding); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - auto bwd_accessor = env.get_accessor(POOL2D_BWD_TASK_ID, bwd_binding); + auto bwd_accessor = env.get_bwd_accessor(POOL2D_BWD_TASK_ID, bwd_binding); float forward_time = forward_task_impl(fwd_accessor).value(); float backward_time = backward_task_impl(bwd_accessor).value(); @@ -207,7 +182,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template 
<> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_input_slot(INPUT); init.add_output_slot(OUTPUT); @@ -217,28 +193,32 @@ void register_task() { init.add_return_value(); - register_task(POOL2D_INIT_TASK_ID, "Pool2D::init", init, init_taks); + register_task(POOL2D_INIT_TASK_ID, "Pool2D::init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - register_task(POOL2D_FWD_TASK_ID, "Pool2D::forward", fwd, forward_task); + register_task(POOL2D_FWD_TASK_ID, "Pool2D::forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(POOL2D_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(POOL2D_BWD_TASK_ID, "Pool2D::backward", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(POOL2D_FWD_TASK_ID)); + +// register_task(POOL2D_BWD_TASK_ID, "Pool2D::backward", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/pool_2d.h b/lib/local-execution/src/ops/pool_2d.h similarity index 97% rename from lib/runtime/src/ops/pool_2d.h rename to lib/local-execution/src/ops/pool_2d.h index f8701f461e..852110e2e2 100644 --- a/lib/runtime/src/ops/pool_2d.h +++ b/lib/local-execution/src/ops/pool_2d.h @@ -20,7 +20,7 @@ OpTaskInvocation backward(Pool2DAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, Pool2DAttrs const &attrs, - ParallelTensorShape const &input_shape, + InputParallelTensorDesc const &input_shape, ProfilingSettings const &settings, MachineView const &machine_view); diff --git 
a/lib/runtime/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc similarity index 60% rename from lib/runtime/src/ops/reduce.cc rename to lib/local-execution/src/ops/reduce.cc index 2674dc4fef..d502a2b669 100644 --- a/lib/runtime/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -1,27 +1,12 @@ #include "reduce.h" #include "kernels/reduce_kernels.h" -#include "legion/legion_utilities.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" + +#include "op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" #include "utils/type_traits_core.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reduce; @@ -54,42 +39,34 @@ static DeviceSpecific auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - OperatorType = attrs.op_type; + OperatorType op_type = attrs.op_type; // Note: How to set the reduction size? 
size_t reduction_size = input.shape.get_volume() / output.shape.get_volume(); DeviceSpecific per_device_state = - acc.create_device_specific(init_kernel( - handle, op_type, reduction_size, input.shape, output.shape)); + init_kernel(handle, op_type, reduction_size, input.shape, output.shape); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT) + OpTaskSignature init; + init.type = OpTaskType::INIT; - init.add_unchecked_arg_slot(HANDLE); + init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); init.add_return_value(); - register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task); + register_task(REDUCE_INIT_TASK_ID, "Reduce::init", init, init_task_impl); } // Note: forward_kernel only needs ReducePerDeviceState, input, output OpTaskInvocation forward(ReduceAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - bind.bind_arg(PROFILING, profiling_tensor()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + binding.bind_arg(PROFILING, profiling_settings()); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -97,7 +74,7 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) { return {REDUCE_FWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -113,25 +90,18 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - 
TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FORWARD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task); + register_task(REDUCE_FWD_TASK_ID, "Reduce::forward", fwd, forward_task_impl); } OpTaskInvocation backward(ReduceAttrs const &attrs) { @@ -140,48 +110,44 @@ OpTaskInvocation backward(ReduceAttrs const &attrs) { return {REDUCE_BWD_TASK_ID, binding}; } -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, "[Reduce] backward_time = %.2lfms\n", per_device_state, - input.get_float_ptr(), - output.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} +// TODO: OpTaskSignature -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REDUCE_FWD_TASK_ID)); - reister_task(REDUCE_BWD_TASK_ID, "Reduce::backward", bwd, backward_task); -} +// register_task(REDUCE_BWD_TASK_ID, 
"Reduce::backward", bwd, +// backward_task_impl); +// } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReduceAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); + init_binding.bind_arg(HANDLE, ff_handle()); auto init_accessor = env.get_init_accessor(REDUCE_INIT_TASK_ID, init_binding); DeviceSpecific per_device_state = @@ -189,10 +155,10 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SimTaskBinding fwd_binding; ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - fwd.bind(INPUT, input.shape); - fwd.bind(OUTPUT, output_shape); - fwd.bind_arg(PROFILING, settings); - fwd.bind_arg(PER_DEVICE_STATE, per_device_state); + fwd_binding.bind(INPUT, input.shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); diff --git a/lib/runtime/src/ops/reduce.h b/lib/local-execution/src/ops/reduce.h similarity index 98% rename from lib/runtime/src/ops/reduce.h rename to lib/local-execution/src/ops/reduce.h index 099083ed67..4c22a9127e 100644 --- a/lib/runtime/src/ops/reduce.h +++ b/lib/local-execution/src/ops/reduce.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #include "op-attrs/ops/reduce.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc similarity index 58% rename from lib/runtime/src/ops/reduction.cc rename to lib/local-execution/src/ops/reduction.cc index 9a11d3a6f5..31b3e2458d 100644 --- 
a/lib/runtime/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -13,32 +13,14 @@ * limitations under the License. */ -#include "parallel_ops/reduction.h" +#include "reduction.h" #include "kernels/reduction_kernels.h" -#include "op-attrs/get_output_shape.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reduction; @@ -61,7 +43,7 @@ OpTaskInvocation backward(ReductionAttrs const &attrs) { return {REDUCTION_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling_settings = acc.get_argument(PROFILING); @@ -71,40 +53,25 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { size_t num_replicas = attrs.reduction_degree; - return profiling(forward_kernel, - profiling_settings, - "[Reduction] forward_time = %.2lfms\n", - input, - output, - num_replicas); + return profile(forward_kernel, + profiling_settings, + "[Reduction] forward_time = %.2lfms\n", + input, + output, + num_replicas); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static 
std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward_kernel, - profiling, - "[Reduction] backward_time = %.2lfms\n", - input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + return profile(backward_kernel, + profiling, + "[Reduction] backward_time = %.2lfms\n", + input_grad, + output_grad); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -114,13 +81,13 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, MachineView const &machine_view) { ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(ATTRS, attrs); fwd_binding.bind(INPUT, input.shape); - fwd.binding.bind(OUTPUT, output_shape); + fwd_binding.bind(OUTPUT, output_shape); auto fwd_accessor = env.get_fwd_accessor(REDUCTION_FWD_TASK_ID, fwd_binding); @@ -137,7 +104,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_arg_slot(ATTRS); @@ -145,15 +113,18 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REDUCTION_FWD_TASK_ID, "Reduction Fwd", fwd, forward_task); + register_task(REDUCTION_FWD_TASK_ID, "Reduction Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - 
OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REDUCTION_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REDUCTION_BWD_TASK_ID, "Reduction Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REDUCTION_FWD_TASK_ID)); + +// register_task(REDUCTION_BWD_TASK_ID, "Reduction Bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reduction.h b/lib/local-execution/src/ops/reduction.h similarity index 96% rename from lib/runtime/src/ops/reduction.h rename to lib/local-execution/src/ops/reduction.h index 978ca6b080..071c4d2a7b 100644 --- a/lib/runtime/src/ops/reduction.h +++ b/lib/local-execution/src/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "op-attrs/ops/combine.h" +#include "op-attrs/ops/reduction.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/repartition.h b/lib/local-execution/src/ops/repartition.h similarity index 98% rename from lib/runtime/src/ops/repartition.h rename to lib/local-execution/src/ops/repartition.h index fccc0de7be..0c8cdaf0f9 100644 --- a/lib/runtime/src/ops/repartition.h +++ b/lib/local-execution/src/ops/repartition.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_PARTITION_H #include "op-attrs/ops/repartition.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc similarity index 65% rename from lib/runtime/src/ops/replicate.cc rename to lib/local-execution/src/ops/replicate.cc index 1675a62c5f..fa13766d9e 100644 --- a/lib/runtime/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -13,39 +13,21 @@ * limitations under the License. 
*/ -#include "parallel_ops/replicate.h" +#include "replicate.h" #include "kernels/replicate_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" -#include "utils/exceptions.h" +#include "utils/exception.h" #include "utils/graph/serialparallel.h" #include "utils/hash-utils.h" #include namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::LogicalPartition; -using Legion::LogicalRegion; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Replicate; -enum Slots { INPUT, OUTPUT, PROFILING }; +enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; OpTaskInvocation forward(ReplicateAttrs const &attrs) { OpTaskBinding binding; @@ -54,6 +36,7 @@ OpTaskInvocation forward(ReplicateAttrs const &attrs) { binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); return {REPLICATE_FWD_TASK_ID, binding}; } @@ -63,7 +46,7 @@ OpTaskInvocation backward(ReplicateAttrs const &attrs) { return {REPLICATE_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -76,33 +59,20 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional 
backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto const &attrs = acc.get_argument(ATTRS); return profile(backward_kernel, profiling, "[replicate] backward_time = %.2lfms\n", input_grad, - output_grad); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + output_grad, + attrs.replicate_degree); // is this `num_replicas`? } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -110,7 +80,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PROFILING, settings); ParallelTensorShape output = get_output_shape(attrs, input.shape); @@ -130,20 +100,25 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REPLICATE_FWD_TASK_ID, "Replicate fwd", fwd, forward_task); + register_task(REPLICATE_FWD_TASK_ID, "Replicate fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(CAST_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(REPLICATE_BWD_TASK_ID, "Replicate bwd", bwd, backward_task); -} +// template <> 
+// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(CAST_FWD_TASK_ID)); + +// register_task(REPLICATE_BWD_TASK_ID, "Replicate bwd", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/replicate.h b/lib/local-execution/src/ops/replicate.h similarity index 95% rename from lib/runtime/src/ops/replicate.h rename to lib/local-execution/src/ops/replicate.h index da2b71f098..510676931b 100644 --- a/lib/runtime/src/ops/replicate.h +++ b/lib/local-execution/src/ops/replicate.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_REPLICATE_H #include "op-attrs/ops/replicate.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc similarity index 68% rename from lib/runtime/src/ops/reshape.cc rename to lib/local-execution/src/ops/reshape.cc index c9dc8cff8d..2b3200d79d 100644 --- a/lib/runtime/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -15,24 +15,10 @@ #include "reshape.h" #include "kernels/reshape_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" namespace FlexFlow { // declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reshape; @@ -69,24 +55,14 @@ static DeviceSpecific auto attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(attrs.shape.data_type)); + init_kernel(attrs.shape.data_type); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - 
std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - Profiling profiling = acc.get_argument(PROFILING); + ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -99,18 +75,11 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - Profiling profiling = acc.get_argument(PROFILING); + ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); @@ -123,20 +92,13 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ReshapeAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); auto init_accessor = @@ -164,18 +126,20 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template 
<> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(ATTRS); - init.add_return_value(PER_DEVICE_STATE); + init.add_return_value(); - register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task); + register_task(RESHAPE_INIT_TASK_ID, "Reshape Init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); @@ -183,15 +147,17 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task); + register_task(RESHAPE_FWD_TASK_ID, "Reshape Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_binding(get_op_signature(RESHAPE_FWD_TASK_ID)); + +// register_task(RESHAPE_BWD_TASK_ID, "Reshape Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reshape.h b/lib/local-execution/src/ops/reshape.h similarity index 98% rename from lib/runtime/src/ops/reshape.h rename to lib/local-execution/src/ops/reshape.h index f044e3f057..0b845de5fc 100644 --- a/lib/runtime/src/ops/reshape.h +++ b/lib/local-execution/src/ops/reshape.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RESHAPE_H #include "op-attrs/ops/reshape.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc similarity index 69% rename from lib/runtime/src/ops/reverse.cc rename to 
lib/local-execution/src/ops/reverse.cc index ac64146cd1..6c28966e6e 100644 --- a/lib/runtime/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -19,23 +19,9 @@ #include "op-attrs/get_output_shapes.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Reverse; +using coord_t = long long; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -43,7 +29,7 @@ OpTaskInvocation forward(ReverseAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -56,22 +42,22 @@ OpTaskInvocation backward(ReverseAttrs const &attrs) { return {REVERSE_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int output_size = outtput.shape.get_volume(); + int output_size = output.shape.get_volume(); auto axis = attrs.axis; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < output.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= output.shape[i]; + in_blk_size *= output.shape.at(ff_dim_t(i)); } else if (i == axis) { - reverse_dim_size = output.shape[i]; + reverse_dim_size = output.shape.at(ff_dim_t(i)); } else { - num_out_blks *= output.shape[i]; + num_out_blks *= 
output.shape.at(ff_dim_t(i)); } } @@ -86,29 +72,22 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { output_size); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input.shape.get_dim() - attrs.axis - 1; + int axis = input_grad.shape.get_dim() - attrs.axis.value() - 1; coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; for (int i = 0; i < input_grad.shape.get_dim(); i++) { if (i < axis) { - in_blk_size *= input_grad.shape[i]; + in_blk_size *= input_grad.shape.at(ff_dim_t(i)); } else if (i == axis) { - reverse_dim_size = input_grad.shape[i]; + reverse_dim_size = input_grad.shape.at(ff_dim_t(i)); } else { - num_out_blks *= input_grad.shape[i]; + num_out_blks *= input_grad.shape.at(ff_dim_t(i)); } } @@ -120,15 +99,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { num_out_blks, reverse_dim_size, in_blk_size, - input.shape.get_volume()); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + input_grad.shape.get_volume()); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -136,7 +107,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); 
SimTaskBinding fwd_binding; @@ -161,21 +132,24 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, } template <> -void register_task()) { - OpTaskSignature fwd(OpTaskType::FWD); +void register_task() { + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(REVERSE_FWD_TASK_ID, "Reverse forward", fwd, forward_task); + register_task(REVERSE_FWD_TASK_ID, "Reverse forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(REVERSE_BWD_TASK_ID)); - register_task(REVERSE_BWD_TASK_ID, "Reverse backward", bwd, backward_task); -} +// TODO: OpTaskSignature +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(REVERSE_BWD_TASK_ID)); +// register_task(REVERSE_BWD_TASK_ID, "Reverse backward", bwd, +// backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/reverse.h b/lib/local-execution/src/ops/reverse.h similarity index 95% rename from lib/runtime/src/ops/reverse.h rename to lib/local-execution/src/ops/reverse.h index af4d335429..68545644bd 100644 --- a/lib/runtime/src/ops/reverse.h +++ b/lib/local-execution/src/ops/reverse.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_REVERSE_H_ #include "op-attrs/ops/reverse.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc similarity index 67% rename from lib/runtime/src/ops/softmax.cc rename to lib/local-execution/src/ops/softmax.cc index b67f9730a4..054b3bc7db 100644 --- a/lib/runtime/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -17,26 +17,10 @@ #include "kernels/softmax_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/parallel_tensor_shape.h" -#include 
"utils/exceptions.h" +#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Softmax; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE }; @@ -75,21 +59,11 @@ static DeviceSpecific auto const &attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(handle, attrs.dim)); + init_kernel(handle, attrs.dim.value()); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -101,18 +75,11 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { "[SoftMax] forward_time = %.2lfms\n", per_device_state, input.get_float_ptr(), - output.get_float_ptr(), ); + output.get_float_ptr()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto 
input_grad = acc.get_tensor_grad(INPUT); @@ -124,22 +91,12 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { assert(output_grad.shape == output.shape); - return profile( - backward_kernel, - profiling, - "[SoftMax] backward_time = %.2lfms\n", - input_grad.get_float_ptr(), - output_grad.get_float_ptr(), - output_grad.shape.volume(), // Note(lambda): get num_elements, maybe wrong - ); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + return profile(backward_kernel, + profiling, + "[SoftMax] backward_time = %.2lfms\n", + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.shape.get_volume()); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -147,7 +104,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); @@ -162,7 +119,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, init_task_impl(init_accessor); SimTaskBinding fwd_binding; - ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); fwd_binding.bind(INPUT, input.shape); fwd_binding.bind(OUTPUT, output_shape); fwd_binding.bind_arg(PROFILING, settings); @@ -182,18 +138,20 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); - init.add_return_value_slot(); + init.add_return_value(); - register_task(SOFTMAX_INIT_TASK_ID, "SoftMax Init", init, init_task); + 
register_task(SOFTMAX_INIT_TASK_ID, "SoftMax Init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); @@ -201,15 +159,17 @@ void register_task() { fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(SOFTMAX_FWD_TASK_ID, "SoftMax Fwd", fwd, forward_task); + register_task(SOFTMAX_FWD_TASK_ID, "SoftMax Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(SOFTMAX_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(SOFTMAX_BWD_TASK_ID, "SoftMax Bwd", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(SOFTMAX_FWD_TASK_ID)); + +// register_task(SOFTMAX_BWD_TASK_ID, "SoftMax Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/softmax.h b/lib/local-execution/src/ops/softmax.h similarity index 98% rename from lib/runtime/src/ops/softmax.h rename to lib/local-execution/src/ops/softmax.h index 06b9d09d60..8fe2f96eb5 100644 --- a/lib/runtime/src/ops/softmax.h +++ b/lib/local-execution/src/ops/softmax.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_SOFTMAX_H #include "op-attrs/ops/softmax.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/split.cc b/lib/local-execution/src/ops/split.cc similarity index 67% rename from lib/runtime/src/ops/split.cc rename to lib/local-execution/src/ops/split.cc index 2af5d42874..3661d6e074 100644 --- a/lib/runtime/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -16,28 +16,14 @@ #include "split.h" #include "kernels/array_shape.h" #include "kernels/split_kernels.h" -#include "utils/exceptions.h" +#include "op-attrs/get_output_shapes.h" 
+#include "utils/exception.h" #include "utils/hash-utils.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; using namespace FlexFlow::Kernels::Split; +using coord_t = long long; enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; @@ -58,24 +44,40 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { return {SPLIT_BWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +void calc_block_size(coord_t &num_blks, + coord_t &blk_size, + ArrayShape const &array_shape, + int axis) { + num_blks = 1; + blk_size = 1; + for (int d = 0; d < array_shape.num_elements(); d++) { + if (d <= axis) { + blk_size *= array_shape.at(legion_dim_t(d)); + } else { + num_blks *= array_shape.at(legion_dim_t(d)); + } + } +} + +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis); + calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { coord_t out_num_blks; calc_block_size( - out_num_blks, out_blk_size[i], output.shape, split->legion_axis); + out_num_blks, out_blk_size[i], output.shape, attrs.axis.value()); } + float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, profiling, "Split forward_time = %.2lfms\n", - &output.get_float_ptr(), + &output_float_ptr, 
input.get_float_ptr(), out_blk_size, in_blk_size, @@ -83,71 +85,42 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { attrs.splits.size()); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - // maybe we should add assert like the original code -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input_grade.shape, attrs.axis); + calc_block_size(num_blks, in_blk_size, input_grad.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { coord_t out_num_blks; calc_block_size( - out_num_blks, out_blk_size[i], output_grad.shape, split->legion_axis); + out_num_blks, out_blk_size[i], output_grad.shape, attrs.axis.value()); } + float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, profiling, "Split backward_time = %.2lfms\n", input_grad.get_float_ptr(), - &output_grad.get_float_ptr(), + &output_grad_ptr, out_blk_size, in_blk_size, num_blks, attrs.splits.size()); } -void calc_block_size(coord_t &num_blks, - coord_t &blk_size, - ArrayShape const &array_shape, - int axis) { - num_blks = 1; - blk_size = 1; - for (int d = 0; d < array_shape.get_dim(); d++) { - if (d <= axis) { - blk_size *= (domain.hi()[d] - domain.lo()[d] + 1); - blk_size *= array_shape.at(legion_dim_t(d)) + 1 - } else { - num_blks *= array_shape.at(legion_dim_t(d)) + 1 - } - } -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - 
TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); -} - CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); + std::vector output_shape = + get_output_shapes(attrs, input.shape); SimTaskBinding fwd_binding; fwd_binding.bind(INPUT, input.shape); @@ -166,23 +139,26 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, return make_metrics(forward_time, backward_time, sync_time, env); } +// TODO: OpTaskSignature + template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); - register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task); + register_task(SPLIT_FWD_TASK_ID, "Split Fwd", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = - infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(SPLIT_FWD_TASK_ID)); - register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task); -} +// register_task(SPLIT_BWD_TASK_ID, "Split Bwd", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/split.h b/lib/local-execution/src/ops/split.h similarity index 95% rename from lib/runtime/src/ops/split.h rename to lib/local-execution/src/ops/split.h index d63212e836..1fdfdc2432 100644 --- a/lib/runtime/src/ops/split.h +++ b/lib/local-execution/src/ops/split.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_SPLIT_H #include "op-attrs/ops/split.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include 
"task_spec/op_task_invocation.h" namespace FlexFlow { @@ -20,7 +20,7 @@ OpTaskInvocation backward(SplitAttrs const &); CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, SplitAttrs const &attrs, - InputParallelTensorDes const &input, + InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view); diff --git a/lib/runtime/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc similarity index 59% rename from lib/runtime/src/ops/topk.cc rename to lib/local-execution/src/ops/topk.cc index 958516a6d9..5fb2c6842f 100644 --- a/lib/runtime/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -16,28 +16,9 @@ #include "topk.h" #include "kernels/topk_kernels.h" #include "op-attrs/get_output_shapes.h" -#include "utils/exceptions.h" +#include "utils/exception.h" namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; -using Legion::Machine; -using Legion::Memory; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; using namespace FlexFlow::Kernels::TopK; @@ -50,7 +31,7 @@ enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, PER_DEVICE_STATE }; OpTaskInvocation init(TopKAttrs const &attrs) { OpTaskBinding binding; - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); return {TOPK_INIT_TASK_ID, binding}; } @@ -60,7 +41,7 @@ OpTaskInvocation forward(TopKAttrs const &attrs) { binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - bind.bind_arg(ATTRS, attrs); + binding.bind_arg(ATTRS, attrs); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); @@ -81,23 +62,14 @@ static 
DeviceSpecific auto attrs = acc.get_argument(ATTRS); DeviceSpecific per_device_state = - acc.create_device_specific(init_kernel(attrs.sorted)); + init_kernel(attrs.sorted); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} - -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto per_device_state = - acc.get_device_specific(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); auto profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); @@ -107,31 +79,24 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { size_t batch_size = input.shape.get_volume() / length; auto indices = acc.get_tensor(INDICES); - return profiling(forward_kernel, - profiling, - "[TopK] forward_time = %.2lfms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - indices.get_int32_ptr(), - batch_size, - length, - attrs.k, - attrs.sorted); + return profile(forward_kernel, + profiling, + "[TopK] forward_time = %.2lfms\n", + per_device_state, + input.get_float_ptr(), + output.get_float_ptr(), + indices.get_int32_ptr(), + batch_size, + length, + attrs.k, + attrs.sorted); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto per_device_state = - acc.get_device_specific(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); auto profiling = acc.get_argument(PROFILING); auto input_grad = 
acc.get_tensor_grad(INPUT); @@ -139,27 +104,19 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { auto indices = acc.get_tensor(INDICES); - int length = input.shape.at(legion_dim_t(0)) + 1; - size_t batch_size = input.shape.get_volume() / length; - - return profiling(backward_kernel, - profiling, - "[TopK] backward_time = %.2lfms\n", - per_device_state, - output_grad.get_float_ptr(), - indices.get_int32_ptr(), - input_grad.get_float_ptr(), - batch_size, - length, - attrs.k); -} - -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); + int length = input_grad.shape.at(legion_dim_t(0)) + 1; + size_t batch_size = input_grad.shape.get_volume() / length; + + return profile(backward_kernel, + profiling, + "[TopK] backward_time = %.2lfms\n", + per_device_state, + output_grad.get_float_ptr(), + indices.get_int32_ptr(), + input_grad.get_float_ptr(), + batch_size, + length, + attrs.k); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -167,9 +124,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, InputParallelTensorDesc const &input, ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); - ParallelTensorShape output_shape = get_output_shapes(attrs, input.shape); + ParallelTensorShape output_shape = get_output_shape(attrs, input.shape); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); @@ -200,16 +157,18 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init(OpTaskType::INIT); + OpTaskSignature init; + init.type = OpTaskType::INIT; init.add_arg_slot(ATTRS); // Note: this may have some question init.add_return_value(); - register_task(TOPK_INIT_TASK_ID, "Topk Init", init, init_task); + 
register_task(TOPK_INIT_TASK_ID, "Topk Init", init, init_task_impl); } template <> void register_task() { - OpTaskSignature fwd(OpTaskType::FWD); + OpTaskSignature fwd; + fwd.type = OpTaskType::FWD; fwd.add_arg_slot(PROFILING); fwd.add_arg_slot(ATTRS); @@ -219,14 +178,17 @@ void register_task() { fwd.add_output_slot(OUTPUT); fwd.add_output_slot(INDICES); - register_task(TOPK_FWD_TASK_ID, "TopK Forward", fwd, forward_task); + register_task(TOPK_FWD_TASK_ID, "TopK Forward", fwd, forward_task_impl); } -template <> -void register_task() { - OpTaskSignature bwd = infer_bwd_signature(get_op_signature(TOPK_FWD_TASK_ID)); +// TODO: OpTaskSignature - register_task(TOPK_BWD_TASK_ID, "TopK Backward", bwd, backward_task); -} +// template <> +// void register_task() { +// OpTaskSignature bwd = +// infer_bwd_signature(get_op_signature(TOPK_FWD_TASK_ID)); + +// register_task(TOPK_BWD_TASK_ID, "TopK Backward", bwd, backward_task_impl); +// } }; // namespace FlexFlow diff --git a/lib/runtime/src/ops/topk.h b/lib/local-execution/src/ops/topk.h similarity index 98% rename from lib/runtime/src/ops/topk.h rename to lib/local-execution/src/ops/topk.h index f15ff6de81..fcab2a5a31 100644 --- a/lib/runtime/src/ops/topk.h +++ b/lib/local-execution/src/ops/topk.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_TOPK_H_ #include "op-attrs/ops/topk.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/runtime/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc similarity index 55% rename from lib/runtime/src/ops/transpose.cc rename to lib/local-execution/src/ops/transpose.cc index ea6182772f..f580a46792 100644 --- a/lib/runtime/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -15,27 +15,10 @@ #include "transpose.h" #include "kernels/transpose_kernels.h" -#include "legion/legion_utilities.h" +#include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/transpose.h" #include 
"utils/exception.decl.h" -namespace FlexFlow { -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; - using namespace FlexFlow::Kernels::Transpose; namespace FlexFlow { @@ -57,33 +40,26 @@ OpTaskInvocation init(TransposeAttrs const &attrs) { static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); - std::vector perm = attrs.perm; // default convert stack_vector to vector + std::vector perm = static_cast>(attrs.perm); DeviceSpecific per_device_state = - acc.create_device_specific( - init_kernel(perm.size(), perm)); + init_kernel(perm.size(), perm); return per_device_state; } -static DeviceSpecific - init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - return init_task_impl(acc); -} +// TODO: OpTaskSignature -template <> -void register_task(); -OpTaskSignature init(OpTaskType::INIT) +// template <> +// void register_task() { +// OpTaskSignature init(OpTaskType::INIT); - init.add_arg_slot(ATTRS); +// init.add_arg_slot(ATTRS); -init.add_return_value(); +// init.add_return_value(); -register_task(TRANSPOSE_INIT_TASK_ID, "Transpose::init", init, init_task); -} // namespace FlexFlow +// register_task(TRANSPOSE_INIT_TASK_ID, "Transpose::init", init, +// init_task_impl); +// } OpTaskInvocation forward(TransposeAttrs const &attrs) { OpTaskBinding binding; @@ -92,13 +68,13 @@ OpTaskInvocation forward(TransposeAttrs const &attrs) { per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); - bind.bind(INPUT, input_tensor(0)); - bind.bind(OUTPUT, output_tensor(0)); + 
binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {TRANSPOSE_FWD_TASK_ID, binding}; } -static optional forward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -106,47 +82,32 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - return profiling(forward_kernel, - profiling, - "[Transpose] Forward_time = %.2lf [ms]", - per_device_state, - input, - output); + return profile(forward_kernel, + profiling, + "[Transpose] Forward_time = %.2lf [ms]", + per_device_state, + input, + output); } -static void forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - forward_task_impl(acc); -} - -static optional backward_task_impl(TaskArgumentAccessor const &acc) { +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = - acc.get_per_device_state(PER_DEVICE_STATE); + acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); - return profiling(backward_kernel, - profiling, - "[Transpose] Backward_time = %.2lf [ms]", - per_device_state, - input_grad, - output_grad); + return profile(backward_kernel, + profiling, + "[Transpose] Backward_time = %.2lf [ms]", + per_device_state, + input_grad, + output_grad); } -static void backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - TaskArgumentAccessor acc(task, regions, ctx, runtime); - backward_task_impl(acc); 
-} - -OpTaskInvocation backward(TransposeAttrs const &) { +OpTaskInvocation backward(TransposeAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); return {TRANSPOSE_BWD_TASK_ID, binding}; @@ -159,7 +120,7 @@ CostMetrics &input_descs, // Note:this may have some problem ProfilingSettings const &settings, MachineView const &machine_view) { - auto env = sim.new_environment(); + auto env = sim_factory.new_environment(); SimTaskBinding init_binding; init_binding.bind_arg(ATTRS, attrs); @@ -169,12 +130,13 @@ CostMetrics DeviceSpecific per_device_state = init_task_impl(init_accessor); - ParallelTensorShape output_shape = get_output_shape(attrs, input_descs.shape); + ParallelTensorShape output_shape = + get_output_shape(attrs, input_descs.shapes); SimTaskBinding fwd_binding; fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); fwd_binding.bind_arg(PROFILING, settings); - fwd_binding.bind(INPUT, input_descs.shape); + fwd_binding.bind(INPUT, input_descs.shapes); fwd_binding.bind(OUTPUT, output_shape); auto fwd_accessor = env.get_fwd_accessor(TRANSPOSE_FWD_TASK_ID, fwd_binding); @@ -189,4 +151,4 @@ CostMetrics return make_metrics(forward_time, backward_time, sync_time, env); } -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/lib/runtime/src/ops/transpose.h b/lib/local-execution/src/ops/transpose.h similarity index 98% rename from lib/runtime/src/ops/transpose.h rename to lib/local-execution/src/ops/transpose.h index 52e824ebbf..6c6dffdc8a 100644 --- a/lib/runtime/src/ops/transpose.h +++ b/lib/local-execution/src/ops/transpose.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_TRANSPOSE_H_ #include "op-attrs/ops/transpose.h" +#include "op_task_invocation.h" #include "sim_environment.h" -#include "task_spec/op_task_invocation.h" namespace FlexFlow { From 931b47c4bf710c85d3b9fb6cc09e8a3e6a7d1306 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 14 May 2024 07:00:01 -0700 Subject: [PATCH 04/24] Format --- 
lib/local-execution/include/tracked_allocator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/tracked_allocator.h index 64cc31e858..766411357a 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/tracked_allocator.h @@ -5,7 +5,7 @@ namespace FlexFlow { -struct TrackedAllocator: public Allocator { +struct TrackedAllocator : public Allocator { Allocator() = delete; void *allocate(size_t mem_size); From 8a66ed93a52ecfc0212ebd8ee71fda105a1a64af Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 14 May 2024 07:01:33 -0700 Subject: [PATCH 05/24] Format --- lib/local-execution/include/tracked_allocator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/tracked_allocator.h index 64cc31e858..766411357a 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/tracked_allocator.h @@ -5,7 +5,7 @@ namespace FlexFlow { -struct TrackedAllocator: public Allocator { +struct TrackedAllocator : public Allocator { Allocator() = delete; void *allocate(size_t mem_size); From 3ffe2392271220f0b22b11e820ca01cf6c094b33 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 21 May 2024 15:58:07 -0700 Subject: [PATCH 06/24] Fix tracked allocator --- .../include/tracked_allocator.h | 14 ++++++++++---- lib/local-execution/src/tracked_allocator.cc | 19 ++++++++++--------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/tracked_allocator.h index 766411357a..510716b3af 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/tracked_allocator.h @@ -5,16 +5,22 @@ namespace FlexFlow { -struct TrackedAllocator : public Allocator { - Allocator() = delete; +struct TrackedAllocator : public 
IAllocator { + TrackedAllocator() = default; + TrackedAllocator(TrackedAllocator const &) = delete; + TrackedAllocator(TrackedAllocator &&) = delete; + ~TrackedAllocator() = default; - void *allocate(size_t mem_size); - void deallocate(void *ptr); + void *allocate(size_t) override; + void deallocate(void *) override; size_t get_current_mem_usage(); private: size_t current_mem_usage; }; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(TrackedAllocator); + +Allocator get_tracked_memory_allocator(); } // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 6e666b647c..a3ccdc02b5 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -3,25 +3,26 @@ namespace FlexFlow { -void *TrackedAllocator::allocate(size_t mem_size) { - void *ptr = this->i_allocator->allocate(mem_size); - this->curr_mem_usage += mem_size; +void *TrackedAllocator::allocate(size_t requested_memory_size) { + void *ptr; + checkCUDA(cudaMalloc(&ptr, requested_memory_size)); + this->current_mem_usage += requested_memory_size; return ptr; } void TrackedAllocator::deallocate(void *ptr) { size_t psize; - checkCUDA(cuMemGetAddressRange(nullptr, &psize, ptr)); - this->i_allocator->deallocate(ptr); - this->curr_mem_usage -= psize; + checkCUDA(cudaGetSymbolSize(&psize, ptr)); + checkCUDA(cudaFree(ptr)); + this->current_mem_usage -= psize; } size_t TrackedAllocator::get_current_mem_usage() { - return this->curr_mem_usage; + return this->current_mem_usage; } -TrackedAllocator get_tracked_local_allocator() { - return Allocator::create(); +Allocator get_tracked_memory_allocator() { + return Allocator::create(); } } // namespace FlexFlow From da10906c6edd8051cdc0455fca3f62e82863affb Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 21 May 2024 17:08:58 -0700 Subject: [PATCH 07/24] Fix comp graph --- lib/op-attrs/include/op-attrs/ops/linear.h | 4 + lib/pcg/include/pcg/computation_graph.h | 100 
++++-------------- .../include/pcg/computation_graph_builder.h | 9 +- lib/pcg/include/pcg/tensor.h | 2 + lib/pcg/src/computation_graph.cc | 79 ++++++++++++++ lib/pcg/src/computation_graph_builder.cc | 98 ++++++++++------- lib/pcg/src/tensor.cc | 4 + 7 files changed, 173 insertions(+), 123 deletions(-) create mode 100644 lib/pcg/src/computation_graph.cc diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h index 0eb7ccec45..a46df59282 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear.h +++ b/lib/op-attrs/include/op-attrs/ops/linear.h @@ -34,6 +34,10 @@ FF_VISITABLE_STRUCT( LinearAttrs, out_channels, use_bias, data_type, activation, regularizer); CHECK_VALID_OP_ATTR(LinearAttrs); +TensorShape get_weights_shape(LinearAttrs const &attrs, + TensorShape const &input); +TensorShape get_bias_shape(LinearAttrs const &attrs, TensorShape const &input); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 4d4fa86efa..53aa7eb820 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -17,82 +17,6 @@ struct ComputationGraph OutputLabelledMultiDiGraph> { using strong_typedef::strong_typedef; - std::vector traverse() { - std::vector layers = get_topological_ordering(this->value()); - return transform(layers, [&](Node const &e) -> operator_guid_t { - return operator_guid_t{e}; - }); - } - - std::vector traverse_reverse_order() { - std::vector layers = - reversed>(get_topological_ordering(this->value())); - return transform(layers, [&](Node const &e) -> operator_guid_t { - return operator_guid_t{e}; - }); - } - - bool out_edge_comparator(MultiDiOutput x, MultiDiOutput y) { - return x.src_idx < y.src_idx; - } - - std::vector - sort_edge_set(std::unordered_set edges) { - std::unordered_set outputs = - transform(edges, [&](MultiDiEdge const &e) -> MultiDiOutput { - return MultiDiOutput(e); - }); - std::vector 
sorted_outputs(outputs.begin(), outputs.end()); - sort(sorted_outputs.begin(), sorted_outputs.end(), out_edge_comparator); - return transform(sorted_outputs, - [&](MultiDiOutput const &e) -> tensor_guid_t { - return tensor_guid_t{e}; - }); - } - - std::vector get_outgoing_tensors(operator_guid_t n) { - return sort_edge_set(get_outgoing_edges(this->value(), n.value())); - } - - std::vector get_incoming_tensors(operator_guid_t n) { - return sort_edge_set(get_incoming_edges(this->value(), n.value())); - } - - operator_guid_t add_node(Layer const &layer) { - Node added_node = this->value().add_node(layer); - return operator_guid_t{added_node}; - } - - void add_output(tensor_guid_t const &output, Tensor const &tensor) { - this->value().add_output(output.value(), tensor); - } - - tensor_guid_t create_outgoing_edge(operator_guid_t node, int idx) { - MultiDiOutput edge = {node.value(), NodePort{idx}}; - return tensor_guid_t{edge}; - } - - tensor_guid_t create_outgoing_edge_with_label(operator_guid_t node, - int idx, - Tensor tensor) { - tensor_guid_t tensor_guid = create_outgoing_edge(node, idx); - add_output(tensor_guid, tensor); - return tensor_guid; - } - - void add_incoming_edges(std::vector const &incoming_edges, - operator_guid_t node) { - size_t incoming_edge_dst_port = 0; - for (tensor_guid_t input : incoming_edges) { - MultiDiOutput input_view = input.value(); - MultiDiEdge edge = {node.value(), - NodePort{incoming_edge_dst_port++}, - input_view.src, - input_view.src_idx}; - this->value().add_edge(edge); - } - } - Layer &at(operator_guid_t const &n) { return this->value().at(n.value()); } @@ -108,13 +32,29 @@ struct ComputationGraph Tensor const &at(tensor_guid_t const &e) const { return this->value().at(e.value()); } - - CompGraphOperatorAttrs get_layer_attrs(operator_guid_t const &n) const { - return this->at(n).attrs; - } }; CHECK_WELL_BEHAVED_VALUE_TYPE_NO_HASH(ComputationGraph); +std::vector + traverse_comp_graph(ComputationGraph const &comp_graph); 
+std::vector + traverse_comp_graph_backwards(ComputationGraph const &comp_graph); +std::vector + get_outgoing_tensors(ComputationGraph const &comp_graph, operator_guid_t n); +std::vector + get_incoming_tensors(ComputationGraph const &comp_graph, operator_guid_t n); +operator_guid_t add_node(ComputationGraph &comp_graph, Layer const &layer); +tensor_guid_t create_outgoing_edge_with_label(ComputationGraph &comp_graph, + operator_guid_t node, + int idx, + Tensor tensor); + +void add_incoming_edges(ComputationGraph &comp_graph, + std::vector const &incoming_edges, + operator_guid_t node); +CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const &comp_graph, + operator_guid_t const &n); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 1be8d7ad0e..7ba95d701b 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -250,18 +250,19 @@ struct ComputationGraphBuilder std::vector const &inputs, std::vector>> const &weight_shapes, - Tensor const &output); + TensorShape const &output_shape); std::vector add_layer( Layer const &layer, std::vector const &inputs, std::vector>> const &weight_shapes, - std::vector const &outputs); + std::vector const &output_shapes); tensor_guid_t as_type(tensor_guid_t const &, DataType, std::string const &); TensorShape get_broadcast_target_shape(std::vector const &); - + TensorShape get_shape(tensor_guid_t const &t); + std::vector get_shapes(std::vector const &t); tensor_guid_t element_binary(OperatorType, tensor_guid_t const &lhs, @@ -286,8 +287,6 @@ struct ComputationGraphBuilder tensor_guid_t const &x, std::optional const &maybe_name); - std::unordered_map pre_edge_mapping; - public: ComputationGraph computation_graph; }; diff --git a/lib/pcg/include/pcg/tensor.h b/lib/pcg/include/pcg/tensor.h index 975a69809d..b5ff857a6c 100644 --- a/lib/pcg/include/pcg/tensor.h +++ 
b/lib/pcg/include/pcg/tensor.h @@ -33,6 +33,8 @@ FF_VISITABLE_STRUCT( using Parameter = Tensor; +Tensor construct_tensor_from_output_shape(TensorShape const &); + } // namespace FlexFlow #endif diff --git a/lib/pcg/src/computation_graph.cc b/lib/pcg/src/computation_graph.cc new file mode 100644 index 0000000000..d8a57311bf --- /dev/null +++ b/lib/pcg/src/computation_graph.cc @@ -0,0 +1,79 @@ +#include "pcg/computation_graph.h" + +namespace FlexFlow { + +std::vector traverse_comp_graph(ComputationGraph const & comp_graph) { + std::vector layers = get_topological_ordering(comp_graph.value()); + return transform(layers, [&](Node const &e) -> operator_guid_t { + return operator_guid_t{e}; + }); +} + +std::vector traverse_comp_graph_backwards(ComputationGraph const & comp_graph) { + std::vector layers = + reversed>(get_topological_ordering(comp_graph.value())); + return transform(layers, [&](Node const &e) -> operator_guid_t { + return operator_guid_t{e}; + }); +} + +bool src_edge_comparator(MultiDiOutput x, MultiDiOutput y) { + return x.src_idx < y.src_idx; +} + +std::vector + sort_edge_set(std::unordered_set edges) { + std::unordered_set outputs = + transform(edges, [&](MultiDiEdge const &e) -> MultiDiOutput { + return MultiDiOutput(e); + }); + std::vector sorted_outputs(outputs.begin(), outputs.end()); + sort(sorted_outputs.begin(), sorted_outputs.end(), src_edge_comparator); + return transform(sorted_outputs, + [&](MultiDiOutput const &e) -> tensor_guid_t { + return tensor_guid_t{e}; + }); +} + +std::vector get_outgoing_tensors(ComputationGraph const & comp_graph, + operator_guid_t n) { + return sort_edge_set(get_outgoing_edges(comp_graph.value(), n.value())); +} + +std::vector get_incoming_tensors(ComputationGraph const & comp_graph, operator_guid_t n) { + return sort_edge_set(get_incoming_edges(comp_graph.value(), n.value())); +} + +operator_guid_t add_node(ComputationGraph & comp_graph, Layer const &layer) { + Node added_node = comp_graph.value().add_node(layer); 
+ return operator_guid_t{added_node}; +} + +tensor_guid_t create_outgoing_edge_with_label(ComputationGraph & comp_graph, + operator_guid_t node, + int idx, + Tensor tensor) { + MultiDiOutput edge = {node.value(), NodePort{idx}}; + comp_graph.value().add_output(edge, tensor); + return tensor_guid_t{edge}; +} + +void add_incoming_edges(ComputationGraph & comp_graph, +std::vector const &incoming_edges, + operator_guid_t node) { + size_t incoming_edge_dst_port = 0; + for (tensor_guid_t input : incoming_edges) { + MultiDiOutput input_view = input.value(); + MultiDiEdge edge = {node.value(), + NodePort{incoming_edge_dst_port++}, + input_view.src, + input_view.src_idx}; + comp_graph.value().add_edge(edge); + } +} + +CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const & comp_graph, operator_guid_t const &n) { + return comp_graph.at(n).attrs; +} + +} \ No newline at end of file diff --git a/lib/pcg/src/computation_graph_builder.cc b/lib/pcg/src/computation_graph_builder.cc index f308a4b242..78e49f0695 100644 --- a/lib/pcg/src/computation_graph_builder.cc +++ b/lib/pcg/src/computation_graph_builder.cc @@ -11,11 +11,14 @@ tensor_guid_t ComputationGraphBuilder::add_layer( std::vector const &inputs, std::vector>> const &weight_shapes, - Tensor const &output) { - operator_guid_t node = computation_graph.add_node(layer); - this->computation_graph.add_incoming_edges(inputs, node); - return this->computation_graph.create_outgoing_edge_with_label( - node, 0, output); + TensorShape const &output_shape) { + operator_guid_t node = add_node(computation_graph, layer); + add_incoming_edges(computation_graph, inputs, node); + return create_outgoing_edge_with_label( + computation_graph, + node, + 0, + construct_tensor_from_output_shape(output_shape)); } std::vector ComputationGraphBuilder::add_layer( @@ -23,14 +26,16 @@ std::vector ComputationGraphBuilder::add_layer( std::vector const &inputs, std::vector>> const &weight_shapes, - std::vector const &outputs) { - operator_guid_t 
node = computation_graph.add_node(layer); - this->computation_graph.add_incoming_edges(inputs, node); + std::vector const &output_shapes) { + operator_guid_t node = add_node(computation_graph, layer); + add_incoming_edges(computation_graph, inputs, node); std::vector output_tensor_guids; - for (int i = 0; i < outputs.size(); ++i) { - output_tensor_guids.push_back( - this->computation_graph.create_outgoing_edge_with_label( - node, i, outputs[i])); + for (int i = 0; i < output_shapes.size(); ++i) { + output_tensor_guids.push_back(create_outgoing_edge_with_label( + computation_graph, + node, + i, + construct_tensor_from_output_shape(output_shapes[i]))); } return output_tensor_guids; } @@ -48,12 +53,13 @@ tensor_guid_t tensor_guid_t ComputationGraphBuilder::as_type(tensor_guid_t const &x, DataType data_type, std::string const &name) { - if (x.data_type < data_type) { + Tensor tensor = computation_graph.at(x); + if (tensor.data_type < data_type) { return this->cast(x, data_type, name); - } else if (x.data_type > data_type) { + } else if (tensor.data_type > data_type) { throw mk_runtime_error("Could not convert provided tensor data type {} to " "desired data type {}", - x.data_type, + tensor.data_type, data_type); } return x; @@ -82,7 +88,8 @@ tensor_guid_t ComputationGraphBuilder::element_unary( this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, input); + TensorShape output_shape = + get_output_shape(attrs, computation_graph.at(input)); return this->add_layer(layer, {input}, {}, output_shape); } @@ -97,7 +104,8 @@ tensor_guid_t ComputationGraphBuilder::element_scalar_unary( this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, input); + TensorShape output_shape = + get_output_shape(attrs, computation_graph.at(input)); return this->add_layer(layer, {input}, {}, output_shape); } @@ -126,8 +134,12 
@@ tensor_guid_t ComputationGraphBuilder::element_binary( std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(op_type)); - TensorShape compute_shape = this->get_broadcast_target_shape({lhs, rhs}); - DataType compute_type = std::max(lhs.data_type, rhs.data_type); + Tensor lhs_tensor = computation_graph.at(lhs); + Tensor rhs_tensor = computation_graph.at(rhs); + + TensorShape compute_shape = + this->get_broadcast_target_shape({lhs_tensor, rhs_tensor}); + DataType compute_type = std::max(lhs_tensor.data_type, rhs_tensor.data_type); tensor_guid_t const lhs_input = this->as_type(this->broadcast(lhs, compute_shape), @@ -141,7 +153,8 @@ tensor_guid_t ComputationGraphBuilder::element_binary( ElementBinaryAttrs attrs = {op_type, compute_type, false, false}; Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, lhs_input, rhs_input); + TensorShape output_shape = get_output_shape( + attrs, computation_graph.at(lhs_input), computation_graph.at(rhs_input)); return this->add_layer(layer, {lhs_input, rhs_input}, {}, output_shape); } @@ -162,18 +175,20 @@ tensor_guid_t ComputationGraphBuilder::dense( tensor_guid_t input_recast = this->as_type(input, data_type, unwrapped_name + "input_recast"); + Tensor input_recast_tensor = computation_graph.at(input_recast); Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, input_recast); + TensorShape output_shape = get_output_shape(attrs, input_recast_tensor); Tensor output = { output_shape.dims, data_type, std::nullopt, false, std::nullopt}; std::vector>> weights; weights.push_back( - {get_weights_shape(attrs, input_recast), kernel_initializer}); + {get_weights_shape(attrs, input_recast_tensor), kernel_initializer}); if (use_bias) { - weights.push_back({get_bias_shape(attrs, input_recast), bias_initializer}); + weights.push_back( + {get_bias_shape(attrs, input_recast_tensor), bias_initializer}); } return this->add_layer(layer, {input_recast}, 
weights, output); @@ -348,15 +363,18 @@ tensor_guid_t ComputationGraphBuilder::conv2d( tensor_guid_t input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); + Tensor input_tensor = computation_graph.at(input); + Layer layer = {attrs, name}; - TensorShape output_shape = get_output_shape(attrs, input); + TensorShape output_shape = get_output_shape(attrs, input_tensor); std::vector>> weights; - weights.push_back({get_kernel_shape(attrs, input), kernel_initializer}); + weights.push_back( + {get_kernel_shape(attrs, input_tensor), kernel_initializer}); if (use_bias) { - weights.push_back({get_bias_shape(attrs, input), bias_initializer}); + weights.push_back({get_bias_shape(attrs, input_tensor), bias_initializer}); } return this->add_layer(layer, {input}, weights, output_shape); @@ -374,7 +392,8 @@ tensor_guid_t ComputationGraphBuilder::dropout( tensor_guid_t input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); - TensorShape output_shape = get_output_shape(attrs, input); + TensorShape output_shape = + get_output_shape(attrs, computation_graph.at(input)); return this->add_layer(layer, {input}, {}, output_shape); } @@ -394,8 +413,9 @@ tensor_guid_t ComputationGraphBuilder::embedding( tensor_guid_t input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); - TensorShape output_shape = get_output_shape(attrs, input); - TensorShape weights_shape = get_weights_shape(attrs, input); + Tensor input_tensor = computation_graph.at(input); + TensorShape output_shape = get_output_shape(attrs, input_tensor); + TensorShape weights_shape = get_weights_shape(attrs, input_tensor); return this->add_layer( layer, {input}, {{weights_shape, kernel_initializer}}, output_shape); @@ -410,16 +430,17 @@ std::vector ComputationGraphBuilder::gather( std::string name = maybe_name.value_or(get_default_name(attrs)); Layer layer = {attrs, name}; - if (index.data_type != DataType::INT32 && - index.data_type != DataType::INT64) { + Tensor index_tensor = 
computation_graph.at(index); + if (index_tensor.data_type != DataType::INT32 && + index_tensor.data_type != DataType::INT64) { throw mk_runtime_error("Invalid data type for input tensor 2 for Gather: " "{} (should be {} or {})", - input.data_type, + index_tensor.data_type, DataType::INT32, DataType::INT64); } std::vector output_shapes = - get_output_shapes(attrs, input, index); + get_output_shapes(attrs, computation_graph.at(input), index_tensor); return this->add_layer(layer, {input}, {}, output_shapes); } @@ -428,17 +449,18 @@ tensor_guid_t ComputationGraphBuilder::input(Tensor const &input_tensor, std::optional const &name) { InputAttrs input_attrs = {}; - std::string name = name.value_or(get_default_name(input_attrs)); + std::string str_name = name.value_or(get_default_name(input_attrs)); - Layer layer = {attrs, name}; + Layer layer = {input_attrs, str_name}; return this->add_layer(layer, {}, {}, input_tensor); } -TensorShape get_shape(tensor_guid_t const &t) { - return this->computation_graph.at(t).get_shape(); +TensorShape ComputationGraphBuilder::get_shape(tensor_guid_t const &t) { + return computation_graph.at(t).get_shape(); } -std::vector get_shape(std::vector const &) { +std::vector + ComputationGraphBuilder::get_shapes(std::vector const &) { NOT_IMPLEMENTED(); } diff --git a/lib/pcg/src/tensor.cc b/lib/pcg/src/tensor.cc index a5aa4b0d0c..df29ee0065 100644 --- a/lib/pcg/src/tensor.cc +++ b/lib/pcg/src/tensor.cc @@ -10,4 +10,8 @@ TensorShape Tensor::get_shape() const { return TensorShape(*this); } +Tensor construct_tensor_from_output_shape(TensorShape const &shape) { + return Tensor{shape.dims, shape.data_type, std::nullopt, false, std::nullopt}; +} + } // namespace FlexFlow From 784742c921dc3bcc9216f9513789319a7cd958e4 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 21 May 2024 19:09:43 -0700 Subject: [PATCH 08/24] Add task spec --- .../include}/arg_ref.h | 32 +-- lib/local-execution/include/concrete_arg.h | 55 +++++ .../include}/config.h | 24 
++- .../include}/cost_metrics.h | 4 +- .../include}/device_specific.h | 15 +- lib/local-execution/include/local_allocator.h | 4 +- .../include}/op_arg_ref.h | 4 +- .../include}/op_task_invocation.h | 105 +++++----- .../include}/op_task_signature.h | 64 +++--- .../include}/op_tensor_spec.h | 7 +- .../include}/permissions.h | 9 +- .../include}/profiling.h | 17 +- .../include}/runtime_arg_ref.h | 16 +- .../include}/serialization.h | 130 +----------- .../include}/sim_environment.h | 9 +- .../include}/slot_id.h | 4 +- .../include}/slot_type.h | 4 +- .../include/task_argument_accessor.h | 155 ++++++++++++++ .../src => local-execution/include}/tasks.h | 11 +- .../include}/variadic_tensor_ref.h | 4 +- .../src}/op_task_invocation.cc | 6 - lib/local-execution/src/op_task_signature.cc | 81 ++++++++ .../src/permissions.cc | 30 --- .../include/runtime/task_spec/concrete_arg.h | 46 ----- .../src/task_spec/task_argument_accessor.h | 193 ------------------ 25 files changed, 482 insertions(+), 547 deletions(-) rename lib/{runtime/src/task_spec => local-execution/include}/arg_ref.h (52%) create mode 100644 lib/local-execution/include/concrete_arg.h rename lib/{runtime/include/runtime => local-execution/include}/config.h (89%) rename lib/{runtime/src => local-execution/include}/cost_metrics.h (95%) rename lib/{runtime/src/task_spec => local-execution/include}/device_specific.h (67%) rename lib/{runtime/src/task_spec => local-execution/include}/op_arg_ref.h (84%) rename lib/{runtime/src/task_spec => local-execution/include}/op_task_invocation.h (50%) rename lib/{runtime/src/task_spec => local-execution/include}/op_task_signature.h (71%) rename lib/{runtime/src/task_spec => local-execution/include}/op_tensor_spec.h (55%) rename lib/{runtime/src => local-execution/include}/permissions.h (84%) rename lib/{runtime/include/runtime => local-execution/include}/profiling.h (57%) rename lib/{runtime/src/task_spec => local-execution/include}/runtime_arg_ref.h (56%) rename lib/{runtime/src => 
local-execution/include}/serialization.h (55%) rename lib/{runtime/src => local-execution/include}/sim_environment.h (95%) rename lib/{runtime/include/runtime/task_spec => local-execution/include}/slot_id.h (73%) rename lib/{runtime/src/task_spec => local-execution/include}/slot_type.h (86%) create mode 100644 lib/local-execution/include/task_argument_accessor.h rename lib/{runtime/src => local-execution/include}/tasks.h (95%) rename lib/{runtime/src/task_spec => local-execution/include}/variadic_tensor_ref.h (72%) rename lib/{runtime/src/task_spec => local-execution/src}/op_task_invocation.cc (85%) create mode 100644 lib/local-execution/src/op_task_signature.cc rename lib/{runtime => local-execution}/src/permissions.cc (67%) delete mode 100644 lib/runtime/include/runtime/task_spec/concrete_arg.h delete mode 100644 lib/runtime/src/task_spec/task_argument_accessor.h diff --git a/lib/runtime/src/task_spec/arg_ref.h b/lib/local-execution/include/arg_ref.h similarity index 52% rename from lib/runtime/src/task_spec/arg_ref.h rename to lib/local-execution/include/arg_ref.h index 62f89f0b5c..67e8a47404 100644 --- a/lib/runtime/src/task_spec/arg_ref.h +++ b/lib/local-execution/include/arg_ref.h @@ -1,9 +1,9 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H #include "kernels/ff_handle.h" -#include "runtime/profiling.h" -#include "runtime/task_spec/arg_type_runtime_tag.h" +#include "profiling.h" +#include "serialization.h" #include "utils/type_index.h" #include "utils/visitable.h" @@ -21,37 +21,43 @@ struct ArgRefSpec { template bool holds() const { - return this->type_tag.template matches(); + // return this->type_tag.template matches(); + + return matches(this->type_idx); } LABEL_TYPE const &get_ref_type() const { return this->ref_type; } - ArgTypeRuntimeTag get_type_tag() const { - return this->type_tag; + // TODO - how to extend this for legion 
runtime? + // ArgTypeRuntimeTag get_type_tag() const { + // return this->type_tag; + // } + std::type_index get_type_index() const { + return this->type_idx; } template static ArgRefSpec create(ArgRef const &r) { static_assert(is_serializable::value, "Type must be serializeable"); - return ArgRefSpec(ArgTypeRuntimeTag::create(), r.ref_type); + return ArgRefSpec(init_type_index(), r.ref_type); } template static ArgRefSpec create_device_specific(ArgRef const &r, size_t device_idx) { - return ArgRefSpec(ArgTypeRuntimeTag::create(), r.ref_type, device_idx); + return ArgRefSpec(init_type_index(), r.ref_type, device_idx); } private: - ArgRefSpec(ArgTypeRuntimeTag const &type_tag, LABEL_TYPE ref_type) - : type_tag(type_tag), ref_type(ref_type) {} + ArgRefSpec(std::type_index const &type_index, LABEL_TYPE ref_type) + : type_idx(type_index), ref_type(ref_type) {} - ArgTypeRuntimeTag type_tag; + std::type_index type_idx; LABEL_TYPE ref_type; - optional device_idx = nullopt; + std::optional device_idx = std::nullopt; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/concrete_arg.h b/lib/local-execution/include/concrete_arg.h new file mode 100644 index 0000000000..522d21485e --- /dev/null +++ b/lib/local-execution/include/concrete_arg.h @@ -0,0 +1,55 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H +#define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H + +#include "serialization.h" +#include "utils/type_index.h" +#include + +namespace FlexFlow { + +struct ConcreteArgSpec { +public: + ConcreteArgSpec() = delete; + + template + T const &get() const { + assert(matches(this->type_idx)); + + return *(T const *)ptr.get(); + } + + // ArgTypeRuntimeTag get_type_tag() const { + // return this->type_tag; + // } + // size_t serialize(Legion::Serializer &) const; + + std::type_index get_type_index() const { + return this->type_idx; + } + + template + static ConcreteArgSpec create(T const &t) { + static_assert(is_serializable::value, "Type must be serializable"); + + 
std::type_index type_idx = init_type_index(); + std::shared_ptr ptr = + std::static_pointer_cast(std::make_shared(t)); + + return ConcreteArgSpec(type_idx, ptr); + // ArgTypeRuntimeTag::create()); + } + +private: + ConcreteArgSpec(std::type_index const &type_index, + std::shared_ptr ptr) + : type_idx(type_index), ptr(ptr) {} + // ArgTypeRuntimeTag const &); + + // ArgTypeRuntimeTag type_tag; + std::type_index type_idx; + std::shared_ptr ptr; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/include/runtime/config.h b/lib/local-execution/include/config.h similarity index 89% rename from lib/runtime/include/runtime/config.h rename to lib/local-execution/include/config.h index 34f45040d1..73653aebae 100644 --- a/lib/runtime/include/runtime/config.h +++ b/lib/local-execution/include/config.h @@ -13,12 +13,11 @@ * limitations under the License. */ -#ifndef _FLEXFLOW_CONFIG_H_ -#define _FLEXFLOW_CONFIG_H_ -#include "legion.h" +#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONFIG_H_ +#define _FLEXFLOW_LOCAL_EXECUTION_CONFIG_H_ + #include "op-attrs/param_sync.h" #include "utils/fmt.h" -#include "utils/optional.h" #include "utils/visitable.h" #include @@ -47,6 +46,8 @@ struct FFInitInfo : public use_visitable_cmp { bool allowTensorOpMathConversion; }; +using legion_mapping_tag_id_t = unsigned long; + struct FFConfig : public use_visitable_cmp { public: enum PreservedIDs { @@ -64,7 +65,7 @@ struct FFConfig : public use_visitable_cmp { }; FFConfig() = default; - static Legion::MappingTagID get_hash_id(std::string const &pcname); + static legion_mapping_tag_id_t get_hash_id(std::string const &pcname); public: int epochs = 1; @@ -88,16 +89,17 @@ struct FFConfig : public use_visitable_cmp { bool enable_inplace_optimizations = false; // Control Tensor Op Math Conversion bool allow_tensor_op_math_conversion = false; - optional dataset_path = nullopt; - optional export_strategy_computation_graph_file = nullopt; + std::optional dataset_path = std::nullopt; + std::optional 
export_strategy_computation_graph_file = + std::nullopt; bool include_costs_dot_graph = false; - optional substitution_json_path = nullopt; + std::optional substitution_json_path = std::nullopt; int machine_model_version = 0; - optional machine_model_file = nullopt; + std::optional machine_model_file = std::nullopt; int simulator_segment_size = 16777216; // 16 MB int simulator_max_num_segments = 1; - optional search_num_nodes = nullopt; - optional search_num_workers = nullopt; + std::optional search_num_nodes = std::nullopt; + std::optional search_num_workers = std::nullopt; int base_optimize_threshold = 10; bool enable_control_replication = true; // The default python data loader type is 2 to enable control replication diff --git a/lib/runtime/src/cost_metrics.h b/lib/local-execution/include/cost_metrics.h similarity index 95% rename from lib/runtime/src/cost_metrics.h rename to lib/local-execution/include/cost_metrics.h index 77526ccd1a..edc0190daf 100644 --- a/lib/runtime/src/cost_metrics.h +++ b/lib/local-execution/include/cost_metrics.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_COST_METRICS_H -#define _FLEXFLOW_RUNTIME_SRC_COST_METRICS_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H +#define _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H #include "utils/visitable.h" diff --git a/lib/runtime/src/task_spec/device_specific.h b/lib/local-execution/include/device_specific.h similarity index 67% rename from lib/runtime/src/task_spec/device_specific.h rename to lib/local-execution/include/device_specific.h index e29e4e9450..a055f6d274 100644 --- a/lib/runtime/src/task_spec/device_specific.h +++ b/lib/local-execution/include/device_specific.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_DEVICE_SPECIFIC_ARG_H -#define _FLEXFLOW_RUNTIME_SRC_DEVICE_SPECIFIC_ARG_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H +#define _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H #include "serialization.h" #include "utils/exception.h" @@ -10,10 +10,17 @@ template struct 
DeviceSpecific { DeviceSpecific() = delete; + DeviceSpecific(T ptr_type) { // accessor + size_t device_idx = 0; + DeviceSpecific device_specific = + DeviceSpecific::create(device_idx, ptr_type); + this->ptr = device_specific.ptr; + this->device_idx = device_specific.device_idx; + } template static DeviceSpecific create(size_t device_idx, Args &&...args) { - NOT_IMPLEMENTED(); + NOT_IMPLEMENTED(); // accessor } T const *get(size_t curr_device_idx) const { @@ -26,6 +33,8 @@ struct DeviceSpecific { return this->ptr; } + // TODO: can modify ptr + private: T *ptr; size_t device_idx; diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local_allocator.h index f4b253b281..b47220eb8c 100644 --- a/lib/local-execution/include/local_allocator.h +++ b/lib/local-execution/include/local_allocator.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H -#define _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H #include "kernels/allocation.h" #include diff --git a/lib/runtime/src/task_spec/op_arg_ref.h b/lib/local-execution/include/op_arg_ref.h similarity index 84% rename from lib/runtime/src/task_spec/op_arg_ref.h rename to lib/local-execution/include/op_arg_ref.h index 3e931d79a4..02b354b221 100644 --- a/lib/runtime/src/task_spec/op_arg_ref.h +++ b/lib/local-execution/include/op_arg_ref.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H #include "arg_ref.h" #include "device_specific.h" diff --git a/lib/runtime/src/task_spec/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h similarity index 50% rename from lib/runtime/src/task_spec/op_task_invocation.h rename to lib/local-execution/include/op_task_invocation.h index 56e709734e..2079fabcbc 100644 --- 
a/lib/runtime/src/task_spec/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -1,49 +1,46 @@ #ifndef _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H #define _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H -#include "accessor.h" -#include "index_task_invocation.h" -#include "legion.h" +#include "concrete_arg.h" +#include "kernels/accessor.h" #include "op_arg_ref.h" #include "op_task_signature.h" #include "op_tensor_spec.h" -#include "runtime/config.h" -#include "runtime/profiling.h" +#include "profiling.h" +#include "runtime_arg_ref.h" #include "serialization.h" -#include "standard_task_invocation.h" #include "tasks.h" #include "utils/bidict.h" -#include "utils/optional.h" #include "utils/stack_map.h" #include "variadic_tensor_ref.h" #include #include #include +#include namespace FlexFlow { enum class IsTrainable { YES, NO }; -using OpArgSpec = variant; +using OpArgSpec = + std::variant; + +struct OpArgSpecTypeAccessor { + std::type_index operator()(OpArgSpec &spec) { + return std::visit( + [](auto &&arg) -> std::type_index { return arg.get_type_index(); }, + spec); + } +}; struct OpTaskBinding { OpTaskBinding() = default; - static_assert(is_subeq_variant::value, ""); - - void bind(slot_id, OpTensorSpec const &); - void bind_grad(slot_id, OpTensorSpec const &); - - template - void bind(slot_id name, VariadicTensorRef const &t) { + void bind(slot_id, VariadicTensorRef const &) { NOT_IMPLEMENTED(); } + void bind(slot_id, OpTensorSpec const &); + void bind_grad(slot_id, OpTensorSpec const &); template void bind_device_specific_arg(slot_id name, T const &t) { @@ -70,46 +67,31 @@ struct OpTaskBinding { this->insert_arg_spec(name, OpArgRefSpec::create(ref)); } - template - void bind_arg(slot_id name, TypedFuture const &f) { - this->insert_arg_spec(name, CheckedTypedFuture::create(f)); + void bind_args_from_fwd(OpTaskBinding const &fwd) { + this->arg_bindings = fwd.get_arg_bindings(); } - template - void bind_arg(slot_id name, TypedFutureMap const &fm) { - 
this->insert_arg_spec(name, CheckedTypedFutureMap::create(fm)); + void bind_tensors_from_fwd(OpTaskBinding const &fwd) { + this->tensor_bindings = fwd.get_tensor_bindings(); } std::unordered_map, OpTensorSpec> const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; -private: void insert_arg_spec(slot_id name, OpArgSpec const &arg_spec) { assert(!contains_key(this->arg_bindings, name)); this->arg_bindings.insert({name, arg_spec}); } - // template - // ArgSpec generate_arg_spec(T const &t) { - // static_assert(is_serializable, "Type must be serializable"); - - // size_t pre_size = serializer.get_used_bytes(); - // ff_task_serialize(serializer, t); - // size_t post_size = serializer.get_used_bytes(); - // return { - // typeid(T), - // pre_size, - // post_size - pre_size - // }; - // } - - /* Legion::Serializer serializer; */ std::unordered_map arg_bindings; std::unordered_map, OpTensorSpec> tensor_bindings; }; +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskBinding, + arg_bindings, + tensor_bindings); -struct OpTaskInvocation : public use_visitable_cmp { +struct OpTaskInvocation { public: OpTaskInvocation() = delete; OpTaskInvocation(task_id_t const &task_id, OpTaskBinding const &binding) @@ -119,16 +101,41 @@ struct OpTaskInvocation : public use_visitable_cmp { task_id_t task_id; OpTaskBinding binding; }; +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskInvocation, + task_id, + binding); OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -OpTaskSignature get_op_signature(task_id_t const &); -/* std::unordered_map get_regions_idxs(TaskArgumentFormat - * const &); */ +bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { + // tensors + auto tensor_bindings = inv.binding.get_tensor_bindings(); + for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { + slot_id name = op_tensor_slot_spec.name; + IsGrad is_grad = 
op_tensor_slot_spec.is_grad; + std::pair tensor_key = std::make_pair(name, is_grad); + OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); + if (op_tensor_spec.role != op_tensor_slot_spec.tensor_role || + op_tensor_spec.slot_option != op_tensor_slot_spec.slot_option) { + return false; + } + } + + // args + auto sig_arg_types = sig.get_arg_types(); + OpArgSpecTypeAccessor type_accessor; + for (auto arg_binding : inv.binding.get_arg_bindings()) { + slot_id name = arg_binding.first; + OpArgSpec op_arg_spec = arg_binding.second; + std::type_index arg_type = sig_arg_types.at(name); + if (type_accessor(op_arg_spec) != arg_type) { + return false; + } + } -/* TaskArgumentFormat compile_task_invocation(OpTaskSignature const &, - * OpTaskBinding const &); */ + return true; +} } // namespace FlexFlow diff --git a/lib/runtime/src/task_spec/op_task_signature.h b/lib/local-execution/include/op_task_signature.h similarity index 71% rename from lib/runtime/src/task_spec/op_task_signature.h rename to lib/local-execution/include/op_task_signature.h index 656df39309..626266d10f 100644 --- a/lib/runtime/src/task_spec/op_task_signature.h +++ b/lib/local-execution/include/op_task_signature.h @@ -1,8 +1,11 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OP_TASK_SIGNATURE_H -#define _FLEXFLOW_RUNTIME_SRC_OP_TASK_SIGNATURE_H - -#include "task_invocation.h" -#include "task_signature.h" +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H + +#include "serialization.h" +#include "slot_id.h" +#include "slot_type.h" +#include "tasks.h" +#include "utils/type_index.h" #include "utils/visitable.h" namespace FlexFlow { @@ -14,6 +17,7 @@ enum class TensorRole { }; enum class OpTaskType { INIT, FWD, BWD }; +enum class IsGrad { YES, NO }; enum class OpSlotOptions { OPTIONAL, @@ -25,7 +29,6 @@ enum class OpSlotOptions { struct OpTensorSlotSpec { public: OpTensorSlotSpec() = delete; - OpTensorSlotSpec(slot_id, SlotType, TensorRole); public: 
slot_id name; @@ -38,10 +41,12 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( OpTensorSlotSpec, name, slot_type, tensor_role, is_grad, slot_option); struct OpTaskSignature { - OpTaskSignature() = delete; - explicit OpTaskSignature(OpTaskType); + OpTaskSignature() = default; + // explicit OpTaskSignature(OpTaskType); - OpTaskType get_task_type() const; + OpTaskType get_task_type() const { + return this->type; + } void add_input_slot(slot_id, SlotType slot_type = SlotType::TENSOR); void add_optional_input_slot(slot_id, SlotType slot_type = SlotType::TENSOR); @@ -59,45 +64,39 @@ struct OpTaskSignature { void add_from_slot_spec(OpTensorSlotSpec const &spec); - /* void add_input_slot(slot_id, Legion::PrivilegeMode); */ - /* void add_input_slot(slot_id, SlotType, Legion::PrivilegeMode); */ - - bool operator==(OpTaskSignature const &) const; - bool operator!=(OpTaskSignature const &) const; - template void add_arg_slot(slot_id name) { static_assert(is_serializable::value, "Type must be serializable"); + this->task_arg_types.insert({name, init_type_index()}); } template - void add_return_value(); + void add_return_value() { + // std::type_index return_value = init_type_index(); + this->return_value = init_type_index(); + } // adds arg_slot without checking is_serializable, used for arguments that are // deviceSpecific template void add_unchecked_arg_slot(slot_id name) { - NOT_IMPLEMENTED(); + this->task_arg_types.insert({name, init_type_index()}); } std::unordered_set get_tensor_slots(); void set_arg_types(std::unordered_map const &); std::unordered_map get_arg_types(); -private: + OpTaskType type; + std::optional return_value; std::unordered_map task_arg_types; std::unordered_set op_tensor_slots; }; - -template -OpTaskSignature init_signature(); -template -OpTaskSignature fwd_signature(); -template -OpTaskSignature bwd_signature(); - -template -OpTaskSignature get_signature(); +// FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskSignature, +// type, +// return_value, 
+// task_arg_types, +// op_tensor_slots); template void register_task(task_id_t, @@ -112,6 +111,15 @@ void register_task(task_id_t, F const &func, F const &cpu_func); +template +OpTaskSignature init_signature(); + +template +OpTaskSignature fwd_signature(); + +template +OpTaskSignature bwd_signature(); + } // namespace FlexFlow #endif diff --git a/lib/runtime/src/task_spec/op_tensor_spec.h b/lib/local-execution/include/op_tensor_spec.h similarity index 55% rename from lib/runtime/src/task_spec/op_tensor_spec.h rename to lib/local-execution/include/op_tensor_spec.h index d859bb3072..c12b5342e1 100644 --- a/lib/runtime/src/task_spec/op_tensor_spec.h +++ b/lib/local-execution/include/op_tensor_spec.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_TENSOR_SPEC_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_OP_TENSOR_SPEC_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H #include "op_task_signature.h" @@ -7,9 +7,10 @@ namespace FlexFlow { struct OpTensorSpec { TensorRole role; + OpSlotOptions slot_option; req idx; }; -FF_VISITABLE_STRUCT(OpTensorSpec, role, idx); +FF_VISITABLE_STRUCT(OpTensorSpec, role, slot_option, idx); OpTensorSpec input_tensor(int); OpTensorSpec output_tensor(int); diff --git a/lib/runtime/src/permissions.h b/lib/local-execution/include/permissions.h similarity index 84% rename from lib/runtime/src/permissions.h rename to lib/local-execution/include/permissions.h index e7793a1dcb..ce19e38e7e 100644 --- a/lib/runtime/src/permissions.h +++ b/lib/local-execution/include/permissions.h @@ -1,18 +1,13 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_PERMISSION_H -#define _FLEXFLOW_RUNTIME_SRC_PERMISSION_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H +#define _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H -#include "legion.h" #include "utils/exception.h" #include "utils/fmt.h" -#include "utils/optional.h" namespace FlexFlow { enum class Permissions { NONE, RO, WO, RW }; -Legion::PrivilegeMode 
to_legion(Permissions); -optional from_legion(Legion::PrivilegeMode); - Permissions join(Permissions lhs, Permissions rhs); Permissions meet(Permissions lhs, Permissions rhs); diff --git a/lib/runtime/include/runtime/profiling.h b/lib/local-execution/include/profiling.h similarity index 57% rename from lib/runtime/include/runtime/profiling.h rename to lib/local-execution/include/profiling.h index 3f43ede520..066cdc8404 100644 --- a/lib/runtime/include/runtime/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -1,22 +1,21 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_PROFILING_H -#define _FLEXFLOW_RUNTIME_SRC_PROFILING_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H +#define _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H #include "kernels/profiling.h" -#include "legion.h" -#include "loggers.h" namespace FlexFlow { enum class EnableProfiling { YES, NO }; template -optional +std::optional profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { - optional elapsed = + std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); - if (elapsed.has_value()) { - log_profile.debug(s, elapsed.value()); - } + // TODO -- local logger? 
+ // if (elapsed.has_value()) { + // log_profile.debug(s, elapsed.value()); + // } return elapsed; } diff --git a/lib/runtime/src/task_spec/runtime_arg_ref.h b/lib/local-execution/include/runtime_arg_ref.h similarity index 56% rename from lib/runtime/src/task_spec/runtime_arg_ref.h rename to lib/local-execution/include/runtime_arg_ref.h index 655300e692..1493531dc3 100644 --- a/lib/runtime/src/task_spec/runtime_arg_ref.h +++ b/lib/local-execution/include/runtime_arg_ref.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #include "arg_ref.h" +#include "config.h" #include "device_specific.h" -#include "runtime/config.h" namespace FlexFlow { @@ -18,9 +18,17 @@ using RuntimeArgRef = ArgRef; using RuntimeArgRefSpec = ArgRefSpec; -RuntimeArgRef profiling_settings(); -RuntimeArgRef> ff_handle(); -RuntimeArgRef iteration_config(); +RuntimeArgRef profiling_settings() { + return {RuntimeArgRefType::PROFILING_SETTINGS}; +} + +RuntimeArgRef> ff_handle() { + return {RuntimeArgRefType::FF_HANDLE}; +} + +RuntimeArgRef iteration_config() { + return {RuntimeArgRefType::FF_ITERATION_CONFIG}; +} } // namespace FlexFlow diff --git a/lib/runtime/src/serialization.h b/lib/local-execution/include/serialization.h similarity index 55% rename from lib/runtime/src/serialization.h rename to lib/local-execution/include/serialization.h index 65601990b0..147ed8159c 100644 --- a/lib/runtime/src/serialization.h +++ b/lib/local-execution/include/serialization.h @@ -1,12 +1,9 @@ -#ifndef _FLEXFLOW_RUNTIME_SERIALIZATION_H -#define _FLEXFLOW_RUNTIME_SERIALIZATION_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SERIALIZATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_SERIALIZATION_H #include "kernels/device.h" #include "kernels/nccl.h" -#include "legion.h" -#include "legion/legion_utilities.h" #include "op-attrs/dim_ordered.h" -#include "utils/optional.h" #include "utils/required.h" #include "utils/type_traits.h" #include "utils/variant.h" @@ -28,23 +25,6 @@ namespace FlexFlow { template struct 
needs_serialization {}; -/* template */ -/* class Serializer { */ -/* void serialize(Legion::Serializer &, T const &) const; */ -/* void deserialize(Legion::Deserializer &, T &) const; */ -/* }; */ - -/* template struct trivially_serializable; */ - -/* template struct - * visit_trivially_serializable; */ - -/* template >::value && - * visit_serializable::value)>::type> */ - template struct visit_trivially_serializable; @@ -101,6 +81,10 @@ struct is_trivially_serializable< typename std::enable_if::value>::type> : std::true_type {}; +template +struct is_trivially_serializable> + : is_trivially_serializable {}; + template struct is_trivially_serializable> : is_trivially_serializable {}; @@ -155,108 +139,6 @@ static_assert(std::is_same, static_assert(visit_trivially_serializable::value, ""); static_assert(is_trivially_serializable::value, ""); -template -struct Serialization { - void serialize(Legion::Serializer &, T const &) const; - T deserialize(Legion::Deserializer &) const; -}; - -template -struct Serialization< - T, - typename std::enable_if::value>::type> { - static void serialize(Legion::Serializer &sez, T const &t) { - sez.serialize(&t, sizeof(T)); - } - - static T const &deserialize(Legion::Deserializer &dez) { - void const *cur = dez.get_current_pointer(); - dez.advance_pointer(sizeof(T)); - return *(T const *)cur; - } -}; - -struct needs_serialize_visitor { - bool result = true; - - template - void operator()(char const *, T const &t) { - result &= needs_serialize(t); - } -}; - -template -bool visit_needs_serialize(T const &t) { - needs_serialize_visitor vis; - visit_struct::for_each(t, vis); - return vis.result; -} - -struct serialize_visitor { - serialize_visitor() = delete; - explicit serialize_visitor(Legion::Serializer &sez) : sez(sez) {} - - Legion::Serializer &sez; - - template - void operator()(char const *, T const &t) { - serialize(this->sez, t); - } -}; - -template -void visit_serialize(Legion::Serializer &sez, T const &t) { - serialize_visitor 
vis(sez); - visit_struct::for_each(t, vis); -} - -struct deserialize_visitor { - deserialize_visitor() = delete; - explicit deserialize_visitor(Legion::Deserializer &dez) : dez(dez) {} - - Legion::Deserializer &dez; - - template - T const &operator()(char const *, T &t) { - deserialize(dez, t); - } -}; - -template -T const &visit_deserialize(Legion::Deserializer &dez) { - deserialize_visitor vis(dez); - return visit_struct::for_each(vis); -} - -template -class VisitSerialize { - void serialize(Legion::Serializer &sez, T const &t) const { - return visit_serialize(sez, t); - } - - T const &deserialize(Legion::Deserializer &dez) const { - return visit_deserialize(dez); - } -}; - -template -size_t ff_task_serialize(Legion::Serializer &sez, T const &t) { - static_assert(is_serializable::value, "Type must be serializable"); - - size_t pre_size = sez.get_used_bytes(); - Serialization::serialize(sez, t); - size_t post_size = sez.get_used_bytes(); - - return post_size - pre_size; -} - -template -T const &ff_task_deserialize(Legion::Deserializer &dez) { - static_assert(is_serializable::value, "Type must be serializable"); - - return Serialization::deserialize(dez); -} - } // namespace FlexFlow #endif diff --git a/lib/runtime/src/sim_environment.h b/lib/local-execution/include/sim_environment.h similarity index 95% rename from lib/runtime/src/sim_environment.h rename to lib/local-execution/include/sim_environment.h index 4297d9d970..4409ab8b55 100644 --- a/lib/runtime/src/sim_environment.h +++ b/lib/local-execution/include/sim_environment.h @@ -1,12 +1,13 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OPS_SIM_ENVIRONMENT_H -#define _FLEXFLOW_RUNTIME_SRC_OPS_SIM_ENVIRONMENT_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H +#define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H #include "cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" #include "op-attrs/parallel_tensor_shape.h" -#include "task_spec/op_task_invocation.h" -#include 
"task_spec/task_argument_accessor.h" +#include "op_task_invocation.h" +#include "pcg/machine_view.h" +#include "task_argument_accessor.h" #include namespace FlexFlow { diff --git a/lib/runtime/include/runtime/task_spec/slot_id.h b/lib/local-execution/include/slot_id.h similarity index 73% rename from lib/runtime/include/runtime/task_spec/slot_id.h rename to lib/local-execution/include/slot_id.h index a5e4322d3c..53820fdb2f 100644 --- a/lib/runtime/include/runtime/task_spec/slot_id.h +++ b/lib/local-execution/include/slot_id.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_ID_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_ID_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SPEC_SLOT_ID_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SPEC_SLOT_ID_H #include "utils/strong_typedef.h" diff --git a/lib/runtime/src/task_spec/slot_type.h b/lib/local-execution/include/slot_type.h similarity index 86% rename from lib/runtime/src/task_spec/slot_type.h rename to lib/local-execution/include/slot_type.h index 64b79ee281..957f89fa4e 100644 --- a/lib/runtime/src/task_spec/slot_type.h +++ b/lib/local-execution/include/slot_type.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_TYPE_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_SLOT_TYPE_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_SLOT_TYPE_H +#define _FLEXFLOW_LOCAL_EXECUTION_SLOT_TYPE_H #include "utils/fmt.h" diff --git a/lib/local-execution/include/task_argument_accessor.h b/lib/local-execution/include/task_argument_accessor.h new file mode 100644 index 0000000000..0656af0fe3 --- /dev/null +++ b/lib/local-execution/include/task_argument_accessor.h @@ -0,0 +1,155 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H + +#include "arg_ref.h" +#include "concrete_arg.h" +#include "config.h" +#include "device_specific.h" +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/linear_kernels.h" +#include 
"op-attrs/parallel_tensor_shape.h" +#include "op_task_signature.h" +#include "permissions.h" +#include "tasks.h" +#include "utils/variant.h" +#include +#include +#include +#include +#include +#include + +namespace FlexFlow { + +template +struct privilege_mode_to_accessor_t {}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorW; +}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorR; +}; + +template <> +struct privilege_mode_to_accessor_t { + using type = GenericTensorAccessorW; +}; + +template +using privilege_mode_to_accessor = + typename privilege_mode_to_accessor_t::type; + +using PrivilegeType = + std::variant; +using PrivilegeVariadicType = std::variant, + std::vector>; + +// TODO: define device state variant in another file +using DeviceStates = std::variant; + +using OpArgRefTypeBacking = + std::variant>; +using RuntimeArgRefTypeBacking = std::variant, + FFIterationConfig>; + +using ArgRefBacking = std:: + variant; + +struct ITaskArgumentAccessor { + ITaskArgumentAccessor &operator=(ITaskArgumentAccessor const &) = delete; + + virtual ~ITaskArgumentAccessor() = default; + + virtual ConcreteArgSpec const &get_concrete_arg(slot_id) const = 0; + virtual OpArgRefTypeBacking const &get_op_arg_ref(slot_id) const = 0; + virtual RuntimeArgRefTypeBacking const &get_runtime_arg(slot_id) const = 0; + + virtual PrivilegeType + get_tensor(slot_id slot, Permissions priv, IsGrad is_grad) const = 0; + virtual PrivilegeVariadicType get_variadic_tensor(slot_id slot, + Permissions priv, + IsGrad is_grad) const = 0; + + virtual Allocator get_allocator() const = 0; + virtual size_t get_device_idx() const = 0; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(ITaskArgumentAccessor); + +struct TaskArgumentAccessor { + template + T const &get_argument(slot_id slot) const { + if constexpr (is_in_variant::value) { + return std::get(this->ptr->get_op_arg_ref(slot)); + } else if constexpr (is_in_variant::value) { + 
return std::get(this->ptr->get_runtime_arg(slot)); + } else { + return this->ptr->get_concrete_arg(slot).get(); + } + } + + template + privilege_mode_to_accessor get_tensor(slot_id slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, IsGrad::NO)); + } + + template + privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, IsGrad::YES)); + } + + template + std::vector> + get_variadic_tensor(slot_id slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::NO)); + } + + template + std::vector> + get_variadic_tensor_grad(slot_id slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::YES)); + } + + Allocator get_allocator() const { + return this->ptr->get_allocator(); + } + + template + static + typename std::enable_if::value, + TaskArgumentAccessor>::type + create(Args &&...args) { + return TaskArgumentAccessor( + std::make_shared(std::forward(args)...)); + } + +private: + TaskArgumentAccessor(std::shared_ptr ptr) + : ptr(ptr) {} + std::shared_ptr ptr; +}; + +using DeviceStates = std::variant; + +using TaskImplFunction = std::variant< + std::function, + std::function(TaskArgumentAccessor const &)>>; + +template +TaskImplFunction get_task_impl(); + +template +OpTaskSignature get_signature(); + +} // namespace FlexFlow + +#endif diff --git a/lib/runtime/src/tasks.h b/lib/local-execution/include/tasks.h similarity index 95% rename from lib/runtime/src/tasks.h rename to lib/local-execution/include/tasks.h index 0e07fa3f85..d8fdc93f39 100644 --- a/lib/runtime/src/tasks.h +++ b/lib/local-execution/include/tasks.h @@ -1,8 +1,9 @@ -#ifndef _FLEXFLOW_TASKS_H -#define _FLEXFLOW_TASKS_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASKS_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASKS_H -#include "utils/optional.h" #include +#include +#include namespace FlexFlow { @@ -170,9 +171,9 @@ template void register_task(task_id_t, std::string const &name, F 
const &func, - optional cpu_func = nullopt); + std::optional cpu_func = std::nullopt); -template +template void register_task(); void register_tasks(); diff --git a/lib/runtime/src/task_spec/variadic_tensor_ref.h b/lib/local-execution/include/variadic_tensor_ref.h similarity index 72% rename from lib/runtime/src/task_spec/variadic_tensor_ref.h rename to lib/local-execution/include/variadic_tensor_ref.h index a9d1b54731..077c989c95 100644 --- a/lib/runtime/src/task_spec/variadic_tensor_ref.h +++ b/lib/local-execution/include/variadic_tensor_ref.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_VARIADIC_TENSOR_ARG_REF_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_VARIADIC_TENSOR_ARG_REF_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H +#define _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H #include "arg_ref.h" #include "op_tensor_spec.h" diff --git a/lib/runtime/src/task_spec/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc similarity index 85% rename from lib/runtime/src/task_spec/op_task_invocation.cc rename to lib/local-execution/src/op_task_invocation.cc index fbbfe47726..5342e46b42 100644 --- a/lib/runtime/src/task_spec/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -1,5 +1,4 @@ #include "op_task_invocation.h" -#include "task_argument_accessor.h" namespace FlexFlow { @@ -39,9 +38,4 @@ std::unordered_map, OpTensorSpec> const & return this->tensor_bindings; } -std::unordered_map const & - OpTaskBinding::get_arg_bindings() const { - return this->arg_bindings; -} - } // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc new file mode 100644 index 0000000000..bc3eaa12db --- /dev/null +++ b/lib/local-execution/src/op_task_signature.cc @@ -0,0 +1,81 @@ +#include "op_task_signature.h" + +namespace FlexFlow { + +// OpTaskSignature::OpTaskSignature(OpTaskType t) : type(t){}; + +void OpTaskSignature::add_input_slot(slot_id 
name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_untrainable_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::INPUT, + IsGrad::NO, + OpSlotOptions::UNTRAINABLE}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_untrainable_input_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::INPUT, + IsGrad::NO, + OpSlotOptions::OPTIONAL_UNTRAINABLE}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_output_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::OUTPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_bwd_necessary_output_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::OUTPUT, + IsGrad::NO, + OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_weight_slot(slot_id name, SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = {name, + slot_type, + TensorRole::WEIGHT, + IsGrad::NO, + OpSlotOptions::NECESSARY}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::add_optional_weight_slot(slot_id name, + SlotType slot_type) { + OpTensorSlotSpec op_tensor_slot_spec = { + name, slot_type, TensorRole::WEIGHT, IsGrad::NO, 
OpSlotOptions::OPTIONAL}; + this->op_tensor_slots.insert(op_tensor_slot_spec); +} + +void OpTaskSignature::set_arg_types( + std::unordered_map const &arg_type) { + this->task_arg_types = arg_type; +} + +void OpTaskSignature::add_from_slot_spec(OpTensorSlotSpec const &spec) { + this->op_tensor_slots.insert(spec); +} + +} // namespace FlexFlow diff --git a/lib/runtime/src/permissions.cc b/lib/local-execution/src/permissions.cc similarity index 67% rename from lib/runtime/src/permissions.cc rename to lib/local-execution/src/permissions.cc index 2992780ae1..2843dd1b70 100644 --- a/lib/runtime/src/permissions.cc +++ b/lib/local-execution/src/permissions.cc @@ -3,36 +3,6 @@ namespace FlexFlow { -Legion::PrivilegeMode to_legion(Permissions p) { - switch (p) { - case Permissions::NONE: - return LEGION_NO_ACCESS; - case Permissions::RO: - return LEGION_READ_ONLY; - case Permissions::WO: - return LEGION_WRITE_ONLY; - case Permissions::RW: - return LEGION_READ_WRITE; - default: - throw mk_runtime_error("Unknown permission {}", static_cast(p)); - } -} - -optional from_legion(Legion::PrivilegeMode p) { - switch (p) { - case LEGION_NO_ACCESS: - return Permissions::NONE; - case LEGION_READ_ONLY: - return Permissions::RO; - case LEGION_WRITE_ONLY: - return Permissions::WO; - case LEGION_READ_WRITE: - return Permissions::RW; - default: - return nullopt; - } -} - Permissions join(Permissions lhs, Permissions rhs) { if (lhs <= rhs) { return rhs; diff --git a/lib/runtime/include/runtime/task_spec/concrete_arg.h b/lib/runtime/include/runtime/task_spec/concrete_arg.h deleted file mode 100644 index 1d973eb81a..0000000000 --- a/lib/runtime/include/runtime/task_spec/concrete_arg.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_INCLUDE_RUNTIME_TASK_SPEC_CONCRETE_ARG_H -#define _FLEXFLOW_RUNTIME_INCLUDE_RUNTIME_TASK_SPEC_CONCRETE_ARG_H - -#include "arg_type_runtime_tag.h" -#include "utils/type_index.h" -#include - -namespace FlexFlow { - -struct ConcreteArgSpec { -public: - 
ConcreteArgSpec() = delete; - - template - T const &get() { - assert(this->type_tag.matches()); - - return *(T const *)ptr.get(); - } - - ArgTypeRuntimeTag get_type_tag() const { - return this->type_tag; - } - size_t serialize(Legion::Serializer &) const; - - template - static ConcreteArgSpec create(T const &t) { - static_assert(is_serializable::value, "Type must be serializable"); - - return ConcreteArgSpec(type_index(), - std::make_shared(t), - ArgTypeRuntimeTag::create()); - } - -private: - ConcreteArgSpec(std::type_index, - std::shared_ptr, - ArgTypeRuntimeTag const &); - - ArgTypeRuntimeTag type_tag; - std::shared_ptr ptr; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/runtime/src/task_spec/task_argument_accessor.h b/lib/runtime/src/task_spec/task_argument_accessor.h deleted file mode 100644 index 9cc05b8252..0000000000 --- a/lib/runtime/src/task_spec/task_argument_accessor.h +++ /dev/null @@ -1,193 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_RUNTIME_SRC_TASK_ARGUMENT_ACCESSOR_H - -#include "accessor.h" -#include "device_specific.h" -#include "realm_allocator.h" -#include "runtime/config.h" -#include "utils/exception.h" -#include "utils/stack_map.h" -#include "utils/strong_typedef.h" -#include - -namespace FlexFlow { - -struct region_idx_t : strong_typedef { - using strong_typedef::strong_typedef; -}; - -FF_TYPEDEF_HASHABLE(region_idx_t); -FF_TYPEDEF_PRINTABLE(region_idx_t, "region_idx"); - -using NonvariadicFormat = region_idx_t; -using VariadicFormat = std::vector; - -using TensorArgumentFormat = variant; - -bool is_variadic(TensorArgumentFormat const &); -VariadicFormat get_variadic_format(TensorArgumentFormat const &); -NonvariadicFormat get_nonvariadic_format(TensorArgumentFormat const &); - -struct TaskArgumentFormat { - std::type_index type; - size_t start; - req end; -}; -FF_VISITABLE_STRUCT(TaskArgumentFormat, type, start, end); - -struct FutureArgumentFormat { - std::type_index type; - req future_idx; 
-}; -FF_VISITABLE_STRUCT(FutureArgumentFormat, type, future_idx); - -struct TaskArgumentsFormat { - TaskArgumentsFormat() = default; - - stack_map region_idxs; - stack_map args; - stack_map futures; - stack_map regions; - stack_map data_types; - - void insert(std::pair const &); - void insert(std::pair const &); - - void insert(region_idx_t, Legion::PrivilegeMode, DataType); - void insert(slot_id, region_idx_t); - void insert(slot_id, std::vector const &); -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( - TaskArgumentsFormat, region_idxs, args, futures, regions, data_types); - -Legion::PrivilegeMode get_privileges(TaskArgumentsFormat const &, - region_idx_t const &); -Legion::PrivilegeMode get_privileges(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -Permissions get_permissions(TaskArgumentsFormat const &, region_idx_t const &); -Permissions get_permissions(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -region_idx_t get_region_idx(TaskArgumentsFormat const &, - parallel_tensor_guid_t const &); -DataType get_datatype(TaskArgumentsFormat const &, region_idx_t const &); - -struct TaskArgumentAccessor { - TaskArgumentAccessor(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); - - Allocator get_allocator() const { - return get_gpu_memory_allocator(this->task); - } - - template - T const &get_argument(slot_id slot) const { - NOT_IMPLEMENTED(); - // TaskArgumentFormat arg_fmt = this->args_fmt.args.at(slot); - // std::type_index actual_type = arg_fmt.type; - // std::type_index requested_type = {typeid(T)}; - - // if (actual_type != requested_type) { - // throw mk_runtime_error( - // "Type mismatch in argument access (\"{}\" != \"{}\")", - // actual_type.name(), - // requested_type.name()); - // } - - // void *start_ptr = &((std::uint8_t *)this->task->args)[arg_fmt.start]; - // Legion::Deserializer dez(start_ptr, arg_fmt.start); - - // return ff_task_deserialize(dez); - } - - 
template - optional get_optional_argument(slot_id) const { - NOT_IMPLEMENTED(); - } - - template - std::vector get_variadic_argument(slot_id) const { - NOT_IMPLEMENTED(); - } - - template - privilege_mode_to_accessor - get_generic_accessor(region_idx_t const &idx) const { - auto tensor_privs = get_permissions(this->args_fmt, idx); - if (tensor_privs != PRIV) { - throw mk_runtime_error( - "Privilege mismatch while accessing tensor: {} != {}", - tensor_privs, - PRIV); - } - - return helperGetGenericTensorAccessor( - get_datatype(this->args_fmt, idx), - regions[idx.value()], - task->regions[idx.value()], - FID_DATA, - ctx, - runtime); - } - - template - privilege_mode_to_accessor get_tensor(slot_id slot) const { - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - - return this->get_generic_accessor(argument_format); - } - - template - privilege_mode_to_accessor get_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - - template - std::vector> - get_variadic_tensor(slot_id slot) const { - std::vector> result; - - auto argument_format = - get(this->args_fmt.region_idxs.at(slot)); - for (NonvariadicFormat const &argument : argument_format) { - result.push_back(this->get_generic_accessor(argument)); - } - - return result; - } - - template - std::vector> - get_variadic_tensor_grad(slot_id slot) const { - NOT_IMPLEMENTED(); - } - - template - T *unwrap(DeviceSpecific const &arg) const { - return arg.get(this->get_device_idx()); - } - - template - DeviceSpecific create_device_specific(Args &&...args) const { - return DeviceSpecific::create(this->get_device_idx(), - std::forward(args)...); - } - - size_t get_device_idx() const { - NOT_IMPLEMENTED(); - } - -private: - Legion::Task const *task; - std::vector const ®ions; - Legion::Context ctx; - Legion::Runtime *runtime; - TaskArgumentsFormat const &args_fmt; -}; - -} // namespace FlexFlow - -#endif From 905bdd1d1bf350e3da334c8b7eadaf664009f5ee Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 
21 May 2024 19:16:43 -0700 Subject: [PATCH 09/24] Minor build issues --- lib/local-execution/include/runtime_arg_ref.h | 2 +- lib/utils/include/utils/type_index.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/local-execution/include/runtime_arg_ref.h b/lib/local-execution/include/runtime_arg_ref.h index 1493531dc3..078067150a 100644 --- a/lib/local-execution/include/runtime_arg_ref.h +++ b/lib/local-execution/include/runtime_arg_ref.h @@ -26,7 +26,7 @@ RuntimeArgRef> ff_handle() { return {RuntimeArgRefType::FF_HANDLE}; } -RuntimeArgRef> ff_handle() { +RuntimeArgRef> ff_iteration_config() { return {RuntimeArgRefType::FF_ITERATION_CONFIG}; } diff --git a/lib/utils/include/utils/type_index.h b/lib/utils/include/utils/type_index.h index 49e893faa0..134589e0aa 100644 --- a/lib/utils/include/utils/type_index.h +++ b/lib/utils/include/utils/type_index.h @@ -3,17 +3,18 @@ #include "fmt.h" #include +#include namespace FlexFlow { template -std::type_index type_index() { +std::type_index init_type_index() { return std::type_index(typeid(T)); } template bool matches(std::type_index idx) { - return idx == type_index(); + return idx == init_type_index(); } } // namespace FlexFlow From 3a3684e691f2e459d30bbcda4d21eeb60777fc24 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 22 May 2024 06:55:26 -0700 Subject: [PATCH 10/24] Build op task spec --- .../include/kernels/element_unary_kernels.h | 3 - lib/local-execution/include/op_arg_ref.h | 8 +- .../include/op_task_invocation.h | 33 +------- lib/local-execution/include/runtime_arg_ref.h | 14 +--- lib/local-execution/include/tasks.h | 2 +- .../include/variadic_tensor_ref.h | 4 +- lib/local-execution/src/op_arg_ref.cc | 14 ++++ lib/local-execution/src/op_task_invocation.cc | 75 ++++++++++++++----- lib/local-execution/src/ops/reduce.cc | 2 +- lib/local-execution/src/ops/replicate.cc | 4 +- .../src}/runtime_arg_ref.cc | 4 + .../src/variadic_tensor_ref.cc | 9 +++ 
.../include/op-attrs/get_output_shapes.h | 6 +- .../include/op-attrs/ops/element_unary.h | 3 + lib/op-attrs/include/op-attrs/ops/linear.h | 4 + lib/pcg/src/computation_graph.cc | 36 +++++---- 16 files changed, 129 insertions(+), 92 deletions(-) create mode 100644 lib/local-execution/src/op_arg_ref.cc rename lib/{runtime/src/task_spec => local-execution/src}/runtime_arg_ref.cc (72%) create mode 100644 lib/local-execution/src/variadic_tensor_ref.cc diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 17e0048c65..dedfbb01ef 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -9,9 +9,6 @@ namespace FlexFlow { -using ElementUnaryUnifiedAttrs = - std::variant; - struct ElementUnaryPerDeviceState { ffTensorDescriptor_t inputTensor, outputTensor; req actiDesc; diff --git a/lib/local-execution/include/op_arg_ref.h b/lib/local-execution/include/op_arg_ref.h index 02b354b221..577ac7984a 100644 --- a/lib/local-execution/include/op_arg_ref.h +++ b/lib/local-execution/include/op_arg_ref.h @@ -15,13 +15,9 @@ using OpArgRef = ArgRef; using OpArgRefSpec = ArgRefSpec; template -OpArgRef> per_device_op_state() { - return {OpArgRefType::PER_DEVICE_OP_STATE}; -} +OpArgRef> per_device_op_state(); -OpArgRef input_parallel_tensor_shape(int idx) { - return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; -} +OpArgRef input_parallel_tensor_shape(int idx); } // namespace FlexFlow diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h index 2079fabcbc..ba35383641 100644 --- a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H -#define _FLEXFLOW_RUNTIME_OP_TASK_SPEC_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #include 
"concrete_arg.h" #include "kernels/accessor.h" @@ -108,34 +108,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskInvocation, OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { - // tensors - auto tensor_bindings = inv.binding.get_tensor_bindings(); - for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - slot_id name = op_tensor_slot_spec.name; - IsGrad is_grad = op_tensor_slot_spec.is_grad; - std::pair tensor_key = std::make_pair(name, is_grad); - OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); - if (op_tensor_spec.role != op_tensor_slot_spec.tensor_role || - op_tensor_spec.slot_option != op_tensor_slot_spec.slot_option) { - return false; - } - } - - // args - auto sig_arg_types = sig.get_arg_types(); - OpArgSpecTypeAccessor type_accessor; - for (auto arg_binding : inv.binding.get_arg_bindings()) { - slot_id name = arg_binding.first; - OpArgSpec op_arg_spec = arg_binding.second; - std::type_index arg_type = sig_arg_types.at(name); - if (type_accessor(op_arg_spec) != arg_type) { - return false; - } - } - - return true; -} +bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv); } // namespace FlexFlow diff --git a/lib/local-execution/include/runtime_arg_ref.h b/lib/local-execution/include/runtime_arg_ref.h index 078067150a..05afa456cf 100644 --- a/lib/local-execution/include/runtime_arg_ref.h +++ b/lib/local-execution/include/runtime_arg_ref.h @@ -18,17 +18,9 @@ using RuntimeArgRef = ArgRef; using RuntimeArgRefSpec = ArgRefSpec; -RuntimeArgRef profiling_settings() { - return {RuntimeArgRefType::PROFILING_SETTINGS}; -} - -RuntimeArgRef> ff_handle() { - return {RuntimeArgRefType::FF_HANDLE}; -} - -RuntimeArgRef> ff_iteration_config() { - return {RuntimeArgRefType::FF_ITERATION_CONFIG}; -} +RuntimeArgRef profiling_settings(); +RuntimeArgRef> ff_handle(); 
+RuntimeArgRef> iteration_config(); } // namespace FlexFlow diff --git a/lib/local-execution/include/tasks.h b/lib/local-execution/include/tasks.h index d8fdc93f39..c78fefd4ea 100644 --- a/lib/local-execution/include/tasks.h +++ b/lib/local-execution/include/tasks.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASKS_H #define _FLEXFLOW_LOCAL_EXECUTION_TASKS_H +#include #include #include -#include namespace FlexFlow { diff --git a/lib/local-execution/include/variadic_tensor_ref.h b/lib/local-execution/include/variadic_tensor_ref.h index 077c989c95..091c55b0af 100644 --- a/lib/local-execution/include/variadic_tensor_ref.h +++ b/lib/local-execution/include/variadic_tensor_ref.h @@ -11,9 +11,7 @@ enum class VariadicTensorRefType { INPUT_TENSORS }; template using VariadicTensorRef = ArgRef; -VariadicTensorRef get_input_tensors() { - return {VariadicTensorRefType::INPUT_TENSORS}; -} +VariadicTensorRef get_input_tensors(); } // namespace FlexFlow diff --git a/lib/local-execution/src/op_arg_ref.cc b/lib/local-execution/src/op_arg_ref.cc new file mode 100644 index 0000000000..ba5d215cb5 --- /dev/null +++ b/lib/local-execution/src/op_arg_ref.cc @@ -0,0 +1,14 @@ +#include "op_arg_ref.h" + +namespace FlexFlow { + +template +OpArgRef> per_device_op_state() { + return {OpArgRefType::PER_DEVICE_OP_STATE}; +} + +OpArgRef input_parallel_tensor_shape(int idx) { + return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; +} + +} \ No newline at end of file diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 5342e46b42..39e023e554 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -2,29 +2,21 @@ namespace FlexFlow { -OpTaskSignature get_signature(task_id_t const &) { - NOT_IMPLEMENTED(); +OpTensorSpec input_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::INPUT, option, idx}; } -OpTensorSpec::OpTensorSpec(TensorRole _role, int 
_idx) - : role(_role), idx(_idx) {} - -OpTensorSpec input_tensor(int idx) { - return {TensorRole::INPUT, idx}; -} - -OpTensorSpec output_tensor(int idx) { - return {TensorRole::OUTPUT, idx}; +OpTensorSpec output_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::OUTPUT, option, idx}; } -OpTensorSpec weight_tensor(int idx) { - return {TensorRole::WEIGHT, idx}; +OpTensorSpec weight_tensor(int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY) { + return {TensorRole::WEIGHT, option, idx}; } -// OpTaskBinding::OpTaskBinding() { -// this->serializer.reserve_bytes(sizeof(TaskArgumentFormat)); -// } - void OpTaskBinding::bind(slot_id slot, OpTensorSpec const &tensor_spec) { this->tensor_bindings.insert({{slot, IsGrad::NO}, tensor_spec}); } @@ -38,4 +30,53 @@ std::unordered_map, OpTensorSpec> const & return this->tensor_bindings; } +std::unordered_map const & + OpTaskBinding::get_arg_bindings() const { + return this->arg_bindings; +} + +OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { + OpTaskBinding bwd; + bwd.bind_args_from_fwd(fwd); + bwd.bind_tensors_from_fwd(fwd); + for (auto const &[key, spec] : fwd.get_tensor_bindings()) { + OpSlotOptions slot_option = spec.slot_option; + if (slot_option != OpSlotOptions::UNTRAINABLE || + slot_option != OpSlotOptions::OPTIONAL_UNTRAINABLE) { + slot_id slot = key.first; + bwd.bind_grad(slot, spec); + } + } + return bwd; +} + +bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { + // tensors + auto tensor_bindings = inv.binding.get_tensor_bindings(); + for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { + slot_id name = op_tensor_slot_spec.name; + IsGrad is_grad = op_tensor_slot_spec.is_grad; + std::pair tensor_key = std::make_pair(name, is_grad); + OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); + if (op_tensor_spec.role != op_tensor_slot_spec.tensor_role || + op_tensor_spec.slot_option != 
op_tensor_slot_spec.slot_option) { + return false; + } + } + + // args + auto sig_arg_types = sig.get_arg_types(); + OpArgSpecTypeAccessor type_accessor; + for (auto arg_binding : inv.binding.get_arg_bindings()) { + slot_id name = arg_binding.first; + OpArgSpec op_arg_spec = arg_binding.second; + std::type_index arg_type = sig_arg_types.at(name); + if (type_accessor(op_arg_spec) != arg_type) { + return false; + } + } + + return true; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc index d502a2b669..5228d15a61 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -20,7 +20,7 @@ enum Slots { HANDLE }; -OpTaskInvocation init(TransposeAttrs const &attrs) { +OpTaskInvocation init(ReduceAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(HANDLE, ff_handle()); diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index fa13766d9e..7e8cbac8c1 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -63,8 +63,8 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); return profile(backward_kernel, diff --git a/lib/runtime/src/task_spec/runtime_arg_ref.cc b/lib/local-execution/src/runtime_arg_ref.cc similarity index 72% rename from lib/runtime/src/task_spec/runtime_arg_ref.cc rename to lib/local-execution/src/runtime_arg_ref.cc index a0aa242ce6..a9d573bbb5 100644 --- a/lib/runtime/src/task_spec/runtime_arg_ref.cc +++ b/lib/local-execution/src/runtime_arg_ref.cc @@ -11,4 +11,8 @@ RuntimeArgRef> ff_handle() { return {RuntimeArgRefType::FF_HANDLE}; } 
+RuntimeArgRef> iteration_config() { + return {RuntimeArgRefType::FF_ITERATION_CONFIG}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/local-execution/src/variadic_tensor_ref.cc new file mode 100644 index 0000000000..e524f4d7a5 --- /dev/null +++ b/lib/local-execution/src/variadic_tensor_ref.cc @@ -0,0 +1,9 @@ +#include "variadic_tensor_ref.h" + +namespace FlexFlow { + +VariadicTensorRef get_input_tensors() { + return {VariadicTensorRefType::INPUT_TENSORS}; +} + +} \ No newline at end of file diff --git a/lib/op-attrs/include/op-attrs/get_output_shapes.h b/lib/op-attrs/include/op-attrs/get_output_shapes.h index 6fb93aac91..496cfbb755 100644 --- a/lib/op-attrs/include/op-attrs/get_output_shapes.h +++ b/lib/op-attrs/include/op-attrs/get_output_shapes.h @@ -128,9 +128,7 @@ ParallelTensorShape get_output_shape(DropoutAttrs const &, ParallelTensorShape get_output_shape(ElementBinaryAttrs const &, ParallelTensorShape const &, ParallelTensorShape const &); -ParallelTensorShape get_output_shape(ElementUnaryAttrs const &, - ParallelTensorShape const &); -ParallelTensorShape get_output_shape(ElementScalarUnaryAttrs const &, +ParallelTensorShape get_output_shape(ElementUnaryUnifiedAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(EmbeddingAttrs const &, ParallelTensorShape const &); @@ -153,6 +151,8 @@ ParallelTensorShape get_output_shape(RepartitionAttrs const &, ParallelTensorShape const &); ParallelTensorShape get_output_shape(ReplicateAttrs const &, ParallelTensorShape const &); +ParallelTensorShape get_output_shape(ReshapeAttrs const &, + ParallelTensorShape const &); ParallelTensorShape get_output_shape(ReverseAttrs const &, ParallelTensorShape const &); std::vector get_output_shapes(SplitAttrs const &, diff --git a/lib/op-attrs/include/op-attrs/ops/element_unary.h b/lib/op-attrs/include/op-attrs/ops/element_unary.h index 5e19b81c8c..6a80094dfa 100644 --- 
a/lib/op-attrs/include/op-attrs/ops/element_unary.h +++ b/lib/op-attrs/include/op-attrs/ops/element_unary.h @@ -21,6 +21,9 @@ struct ElementScalarUnaryAttrs { FF_VISITABLE_STRUCT(ElementScalarUnaryAttrs, op_type, scalar); CHECK_VALID_OP_ATTR(ElementScalarUnaryAttrs); +using ElementUnaryUnifiedAttrs = + std::variant; + } // namespace FlexFlow #endif diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h index a46df59282..3b57a959b8 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear.h +++ b/lib/op-attrs/include/op-attrs/ops/linear.h @@ -36,7 +36,11 @@ CHECK_VALID_OP_ATTR(LinearAttrs); TensorShape get_weights_shape(LinearAttrs const &attrs, TensorShape const &input); +ParallelTensorShape get_weights_shape(LinearAttrs const &attrs, + ParallelTensorShape const &input); TensorShape get_bias_shape(LinearAttrs const &attrs, TensorShape const &input); +ParallelTensorShape get_bias_shape(LinearAttrs const &attrs, + ParallelTensorShape const &input); } // namespace FlexFlow diff --git a/lib/pcg/src/computation_graph.cc b/lib/pcg/src/computation_graph.cc index d8a57311bf..8b7803c181 100644 --- a/lib/pcg/src/computation_graph.cc +++ b/lib/pcg/src/computation_graph.cc @@ -2,14 +2,16 @@ namespace FlexFlow { -std::vector traverse_comp_graph(ComputationGraph const & comp_graph) { +std::vector + traverse_comp_graph(ComputationGraph const &comp_graph) { std::vector layers = get_topological_ordering(comp_graph.value()); return transform(layers, [&](Node const &e) -> operator_guid_t { return operator_guid_t{e}; }); } -std::vector traverse_comp_graph_backwards(ComputationGraph const & comp_graph) { +std::vector + traverse_comp_graph_backwards(ComputationGraph const &comp_graph) { std::vector layers = reversed>(get_topological_ordering(comp_graph.value())); return transform(layers, [&](Node const &e) -> operator_guid_t { @@ -30,27 +32,30 @@ std::vector std::vector sorted_outputs(outputs.begin(), outputs.end()); sort(sorted_outputs.begin(), 
sorted_outputs.end(), src_edge_comparator); return transform(sorted_outputs, - [&](MultiDiOutput const &e) -> tensor_guid_t { - return tensor_guid_t{e}; - }); + [&](MultiDiOutput const &e) -> tensor_guid_t { + return tensor_guid_t{e}; + }); } -std::vector get_outgoing_tensors(ComputationGraph const & comp_graph, - operator_guid_t n) { +std::vector + get_outgoing_tensors(ComputationGraph const &comp_graph, + operator_guid_t n) { return sort_edge_set(get_outgoing_edges(comp_graph.value(), n.value())); } -std::vector get_incoming_tensors(ComputationGraph const & comp_graph, operator_guid_t n) { +std::vector + get_incoming_tensors(ComputationGraph const &comp_graph, + operator_guid_t n) { return sort_edge_set(get_incoming_edges(comp_graph.value(), n.value())); } -operator_guid_t add_node(ComputationGraph & comp_graph, Layer const &layer) { +operator_guid_t add_node(ComputationGraph &comp_graph, Layer const &layer) { Node added_node = comp_graph.value().add_node(layer); return operator_guid_t{added_node}; } -tensor_guid_t create_outgoing_edge_with_label(ComputationGraph & comp_graph, - operator_guid_t node, +tensor_guid_t create_outgoing_edge_with_label(ComputationGraph &comp_graph, + operator_guid_t node, int idx, Tensor tensor) { MultiDiOutput edge = {node.value(), NodePort{idx}}; @@ -58,8 +63,8 @@ tensor_guid_t create_outgoing_edge_with_label(ComputationGraph & comp_graph, return tensor_guid_t{edge}; } -void add_incoming_edges(ComputationGraph & comp_graph, -std::vector const &incoming_edges, +void add_incoming_edges(ComputationGraph &comp_graph, + std::vector const &incoming_edges, operator_guid_t node) { size_t incoming_edge_dst_port = 0; for (tensor_guid_t input : incoming_edges) { @@ -72,8 +77,9 @@ std::vector const &incoming_edges, } } -CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const & comp_graph, operator_guid_t const &n) { +CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const &comp_graph, + operator_guid_t const &n) { return 
comp_graph.at(n).attrs; } -} \ No newline at end of file +} // namespace FlexFlow From a4dd9d4c2ac3a5381b4e60a7b36b45584d6cf7b7 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 22 May 2024 11:44:50 -0700 Subject: [PATCH 11/24] Build ops and op task spec --- .../include/op_task_invocation.h | 14 +----- .../include/op_task_signature.h | 11 ++--- lib/local-execution/include/profiling.h | 4 -- lib/local-execution/src/op_arg_ref.cc | 2 +- lib/local-execution/src/op_task_invocation.cc | 41 +++++++++++----- lib/local-execution/src/op_task_signature.cc | 2 +- lib/local-execution/src/ops/attention.cc | 6 +-- lib/local-execution/src/ops/batch_matmul.cc | 3 +- lib/local-execution/src/ops/batch_norm.cc | 7 ++- lib/local-execution/src/ops/cast.cc | 3 +- lib/local-execution/src/ops/combine.cc | 3 +- lib/local-execution/src/ops/concat.cc | 4 +- lib/local-execution/src/ops/conv_2d.cc | 6 +-- lib/local-execution/src/ops/dropout.cc | 7 ++- lib/local-execution/src/ops/element_binary.cc | 6 +-- lib/local-execution/src/ops/element_unary.cc | 7 ++- lib/local-execution/src/ops/embedding.cc | 3 +- lib/local-execution/src/ops/flat.cc | 3 +- lib/local-execution/src/ops/layer_norm.cc | 24 +++++----- lib/local-execution/src/ops/linear.cc | 6 +-- lib/local-execution/src/ops/partition.cc | 6 +-- lib/local-execution/src/ops/pool_2d.cc | 6 +-- lib/local-execution/src/ops/reduce.cc | 6 +-- lib/local-execution/src/ops/reduction.cc | 3 +- lib/local-execution/src/ops/replicate.cc | 5 +- lib/local-execution/src/ops/reshape.cc | 6 +-- lib/local-execution/src/ops/reverse.cc | 3 +- lib/local-execution/src/ops/softmax.cc | 6 +-- lib/local-execution/src/ops/split.cc | 48 ++++++++++--------- lib/local-execution/src/ops/topk.cc | 6 +-- .../src/variadic_tensor_ref.cc | 2 +- 31 files changed, 113 insertions(+), 146 deletions(-) diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h index ba35383641..03cd19ed8e 100644 --- 
a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -25,14 +25,6 @@ enum class IsTrainable { YES, NO }; using OpArgSpec = std::variant; -struct OpArgSpecTypeAccessor { - std::type_index operator()(OpArgSpec &spec) { - return std::visit( - [](auto &&arg) -> std::type_index { return arg.get_type_index(); }, - spec); - } -}; - struct OpTaskBinding { OpTaskBinding() = default; @@ -101,14 +93,12 @@ struct OpTaskInvocation { task_id_t task_id; OpTaskBinding binding; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskInvocation, - task_id, - binding); +FF_VISITABLE_STRUCT(OpTaskInvocation, task_id, binding); OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv); +bool is_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv); } // namespace FlexFlow diff --git a/lib/local-execution/include/op_task_signature.h b/lib/local-execution/include/op_task_signature.h index 626266d10f..c4553df8a1 100644 --- a/lib/local-execution/include/op_task_signature.h +++ b/lib/local-execution/include/op_task_signature.h @@ -41,8 +41,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( OpTensorSlotSpec, name, slot_type, tensor_role, is_grad, slot_option); struct OpTaskSignature { - OpTaskSignature() = default; - // explicit OpTaskSignature(OpTaskType); + explicit OpTaskSignature(OpTaskType); OpTaskType get_task_type() const { return this->type; @@ -72,7 +71,6 @@ struct OpTaskSignature { template void add_return_value() { - // std::type_index return_value = init_type_index(); this->return_value = init_type_index(); } @@ -92,11 +90,8 @@ struct OpTaskSignature { std::unordered_map task_arg_types; std::unordered_set op_tensor_slots; }; -// FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(OpTaskSignature, -// type, -// return_value, -// task_arg_types, -// op_tensor_slots); 
+FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( + OpTaskSignature, type, return_value, task_arg_types, op_tensor_slots); template void register_task(task_id_t, diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index 066cdc8404..c4ac1b7d02 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -12,10 +12,6 @@ std::optional profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); - // TODO -- local logger? - // if (elapsed.has_value()) { - // log_profile.debug(s, elapsed.value()); - // } return elapsed; } diff --git a/lib/local-execution/src/op_arg_ref.cc b/lib/local-execution/src/op_arg_ref.cc index ba5d215cb5..6bea26a5a2 100644 --- a/lib/local-execution/src/op_arg_ref.cc +++ b/lib/local-execution/src/op_arg_ref.cc @@ -11,4 +11,4 @@ OpArgRef input_parallel_tensor_shape(int idx) { return {OpArgRefType::PARALLEL_TENSOR_SHAPE}; } -} \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 39e023e554..94504840c0 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -50,28 +50,38 @@ OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { return bwd; } -bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { - // tensors +bool is_op_tensor_spec_invalid(OpTensorSlotSpec tensor_slot_spec, + OpTensorSpec tensor_spec) { + return tensor_spec.role != tensor_slot_spec.tensor_role || + tensor_spec.slot_option != tensor_slot_spec.slot_option; +} + +bool is_tensor_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - slot_id name = op_tensor_slot_spec.name; - IsGrad is_grad = 
op_tensor_slot_spec.is_grad; - std::pair tensor_key = std::make_pair(name, is_grad); + std::pair tensor_key = + std::make_pair(op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad); OpTensorSpec const &op_tensor_spec = tensor_bindings.at(tensor_key); - if (op_tensor_spec.role != op_tensor_slot_spec.tensor_role || - op_tensor_spec.slot_option != op_tensor_slot_spec.slot_option) { + if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { return false; } } + return true; +} - // args +bool is_arg_type_invalid(std::type_index expected_arg_type, + OpArgSpec op_arg_spec) { + std::type_index arg_spec_type = std::visit( + [](auto &&arg) -> std::type_index { return arg.get_type_index(); }, + op_arg_spec); + return arg_spec_type != expected_arg_type; +} + +bool is_arg_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { auto sig_arg_types = sig.get_arg_types(); - OpArgSpecTypeAccessor type_accessor; for (auto arg_binding : inv.binding.get_arg_bindings()) { - slot_id name = arg_binding.first; - OpArgSpec op_arg_spec = arg_binding.second; - std::type_index arg_type = sig_arg_types.at(name); - if (type_accessor(op_arg_spec) != arg_type) { + std::type_index arg_type = sig_arg_types.at(arg_binding.first); + if (is_arg_type_invalid(arg_type, arg_binding.second)) { return false; } } @@ -79,4 +89,9 @@ bool validate_invocation(OpTaskSignature sig, OpTaskInvocation inv) { return true; } +bool is_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { + return is_tensor_invocation_valid(sig, inv) && + is_arg_invocation_valid(sig, inv); +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index bc3eaa12db..71642680a6 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -2,7 +2,7 @@ namespace FlexFlow { -// OpTaskSignature::OpTaskSignature(OpTaskType t) : type(t){}; +OpTaskSignature::OpTaskSignature(OpTaskType t) : 
type(t){}; void OpTaskSignature::add_input_slot(slot_id name, SlotType slot_type) { OpTensorSlotSpec op_tensor_slot_spec = { diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index 854213a955..414b71ec70 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -277,8 +277,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_arg_slot(QUERY_PARALLEL_TENSOR_SHAPE); init.add_arg_slot(KEY_PARALLEL_TENSOR_SHAPE); init.add_arg_slot(VALUE_PARALLEL_TENSOR_SHAPE); @@ -309,8 +308,7 @@ OpTaskSignature get_signature() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(QUERY); fwd.add_input_slot(KEY); diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index c5df564afd..eccbe5a475 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -187,8 +187,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(A_INPUT); fwd.add_input_slot(B_INPUT); diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index dadfab14e0..5e640d70e0 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -189,8 +189,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); init.add_input_slot(BIAS); 
init.add_output_slot(OUTPUT); @@ -211,8 +211,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_input_slot(SCALE); diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 0914ea40a6..5647d7e7f2 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -107,8 +107,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index 942d964021..0bce55722a 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -84,8 +84,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 3d62c19f20..087f08b577 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -111,8 +111,8 @@ CostMetrics template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); + fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUTS, SlotType::VARIADIC); diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index 0df15e9b23..a53b259fac 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -178,8 +178,7 @@ CostMetrics 
measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); init.add_output_slot(OUTPUT); @@ -202,8 +201,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index 236b7e2c88..4935091ee5 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -124,8 +124,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); + init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(FF_HANDLE); init.add_output_slot(OUTPUT); @@ -145,8 +145,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc index 0cec2b8d0a..b5588e04fd 100644 --- a/lib/local-execution/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -173,8 +173,7 @@ CostMetrics template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(LHS_INPUT); init.add_input_slot(RHS_INPUT); @@ -197,8 +196,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); 
fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index 9567fc1570..ddec57414a 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -150,8 +150,8 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); + init.add_arg_slot(INPUT_SHAPE); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); @@ -171,8 +171,7 @@ void register_task() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/local-execution/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc index 31dc83814f..bac48c4b24 100644 --- a/lib/local-execution/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -120,8 +120,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_input_slot(OUTPUT); diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 45d3805e0c..9849bd3b73 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -82,8 +82,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index 3caf95c068..83d04b893f 100644 --- 
a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -133,15 +133,16 @@ static DeviceSpecific num_replicas *= input.shape.at(legion_dim_t(i)); effective_num_elements = M; effective_batch_size = input.shape.get_volume() / M; - - DeviceSpecific per_device_state = - init_kernel(handle, - allocator, - attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, - attrs.eps); } + + DeviceSpecific per_device_state = + init_kernel(handle, + allocator, + attrs.elementwise_affine, + effective_batch_size, + effective_num_elements, + attrs.eps); + return per_device_state; } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -186,8 +187,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); @@ -208,8 +208,8 @@ OpTaskSignature bwd_signature() { template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 2d13909c09..08e8fa3f68 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -211,8 +211,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> OpTaskSignature init_signature() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); init.add_weight_slot(WEIGHT); @@ -227,8 +226,7 @@ OpTaskSignature init_signature() { template <> OpTaskSignature fwd_signature() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); 
fwd.add_weight_slot(WEIGHT); diff --git a/lib/local-execution/src/ops/partition.cc b/lib/local-execution/src/ops/partition.cc index c6e5bce64d..1d358b52f5 100644 --- a/lib/local-execution/src/ops/partition.cc +++ b/lib/local-execution/src/ops/partition.cc @@ -135,8 +135,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_unchecked_arg_slot(HANDLE); @@ -150,8 +149,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 32bc5d1616..576a5a8d23 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -182,8 +182,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_input_slot(INPUT); init.add_output_slot(OUTPUT); @@ -198,8 +197,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc index 5228d15a61..0ccd7be6e3 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -49,8 +49,7 @@ static DeviceSpecific template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); @@ -92,8 +91,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor 
const &acc) { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index 31b3e2458d..86f300df63 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -104,8 +104,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_arg_slot(ATTRS); diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 7e8cbac8c1..3322f8a1ce 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -72,7 +72,7 @@ static std::optional "[replicate] backward_time = %.2lfms\n", input_grad, output_grad, - attrs.replicate_degree); // is this `num_replicas`? 
+ attrs.replicate_degree); } CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, @@ -100,8 +100,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index 2b3200d79d..c53fe5d78b 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -126,8 +126,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); init.add_arg_slot(ATTRS); @@ -138,8 +137,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 6c28966e6e..49f1e51076 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -133,8 +133,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_input_slot(INPUT); diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 054b3bc7db..5a65127140 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -138,8 +138,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature 
init(OpTaskType::INIT); init.add_unchecked_arg_slot(HANDLE); init.add_arg_slot(ATTRS); @@ -150,8 +149,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc index 3661d6e074..ffb40515ad 100644 --- a/lib/local-execution/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -44,17 +44,17 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { return {SPLIT_BWD_TASK_ID, binding}; } -void calc_block_size(coord_t &num_blks, - coord_t &blk_size, +void calc_block_size(coord_t &num_blocks, + coord_t &block_size, ArrayShape const &array_shape, int axis) { - num_blks = 1; - blk_size = 1; + num_blocks = 1; + block_size = 1; for (int d = 0; d < array_shape.num_elements(); d++) { if (d <= axis) { - blk_size *= array_shape.at(legion_dim_t(d)); + block_size *= array_shape.at(legion_dim_t(d)); } else { - num_blks *= array_shape.at(legion_dim_t(d)); + num_blocks *= array_shape.at(legion_dim_t(d)); } } } @@ -65,13 +65,13 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input.shape, attrs.axis.value()); + coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; + calc_block_size(num_blocks, in_block_size, input.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blks; + coord_t out_num_blocks; calc_block_size( - out_num_blks, out_blk_size[i], output.shape, attrs.axis.value()); + out_num_blocks, out_block_size[i], output.shape, attrs.axis.value()); } float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, @@ -79,9 +79,9 @@ static 
std::optional forward_task_impl(TaskArgumentAccessor const &acc) { "Split forward_time = %.2lfms\n", &output_float_ptr, input.get_float_ptr(), - out_blk_size, - in_blk_size, - num_blks, + out_block_size, + in_block_size, + num_blocks, attrs.splits.size()); } @@ -93,12 +93,15 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blks, in_blk_size, out_blk_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blks, in_blk_size, input_grad.shape, attrs.axis.value()); + coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; + calc_block_size( + num_blocks, in_block_size, input_grad.shape, attrs.axis.value()); for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blks; - calc_block_size( - out_num_blks, out_blk_size[i], output_grad.shape, attrs.axis.value()); + coord_t out_num_blocks; + calc_block_size(out_num_blocks, + out_block_size[i], + output_grad.shape, + attrs.axis.value()); } float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, @@ -106,9 +109,9 @@ static std::optional "Split backward_time = %.2lfms\n", input_grad.get_float_ptr(), &output_grad_ptr, - out_blk_size, - in_blk_size, - num_blks, + out_block_size, + in_block_size, + num_blocks, attrs.splits.size()); } @@ -143,8 +146,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc index 5fb2c6842f..f6783a2d6c 100644 --- a/lib/local-execution/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -157,8 +157,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, template <> void register_task() { - OpTaskSignature init; - init.type = OpTaskType::INIT; + OpTaskSignature init(OpTaskType::INIT); 
init.add_arg_slot(ATTRS); // Note: this may have some question init.add_return_value(); @@ -167,8 +166,7 @@ void register_task() { template <> void register_task() { - OpTaskSignature fwd; - fwd.type = OpTaskType::FWD; + OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); fwd.add_arg_slot(ATTRS); diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/local-execution/src/variadic_tensor_ref.cc index e524f4d7a5..74d0f0d9e7 100644 --- a/lib/local-execution/src/variadic_tensor_ref.cc +++ b/lib/local-execution/src/variadic_tensor_ref.cc @@ -6,4 +6,4 @@ VariadicTensorRef get_input_tensors() { return {VariadicTensorRefType::INPUT_TENSORS}; } -} \ No newline at end of file +} // namespace FlexFlow From 5bc719f893bf65252e8d56f3677f4742b6477e15 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 22 May 2024 12:52:50 -0700 Subject: [PATCH 12/24] Simplify edge set obtain --- lib/pcg/include/pcg/computation_graph.h | 22 +++++----- lib/pcg/src/computation_graph.cc | 55 +++++++++++------------- lib/pcg/src/computation_graph_builder.cc | 19 ++++---- 3 files changed, 46 insertions(+), 50 deletions(-) diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 53aa7eb820..c051fcc8c3 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -36,22 +36,22 @@ struct ComputationGraph CHECK_WELL_BEHAVED_VALUE_TYPE_NO_HASH(ComputationGraph); std::vector - traverse_comp_graph(ComputationGraph const &comp_graph); + traverse_comp_graph_forward(ComputationGraph const &comp_graph); std::vector - traverse_comp_graph_backwards(ComputationGraph const &comp_graph); + traverse_comp_graph_backward(ComputationGraph const &comp_graph); std::vector get_outgoing_tensors(ComputationGraph const &comp_graph, operator_guid_t n); std::vector get_incoming_tensors(ComputationGraph const &comp_graph, operator_guid_t n); -operator_guid_t add_node(ComputationGraph &comp_graph, Layer const &layer); 
-tensor_guid_t create_outgoing_edge_with_label(ComputationGraph &comp_graph, - operator_guid_t node, - int idx, - Tensor tensor); - -void add_incoming_edges(ComputationGraph &comp_graph, - std::vector const &incoming_edges, - operator_guid_t node); +operator_guid_t create_node(ComputationGraph &comp_graph, Layer const &layer); +tensor_guid_t create_outgoing_edge(ComputationGraph &comp_graph, + operator_guid_t node, + int idx, + Tensor tensor); + +void connect_incoming_edges(ComputationGraph &comp_graph, + std::vector const &incoming_edges, + operator_guid_t node); CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const &comp_graph, operator_guid_t const &n); diff --git a/lib/pcg/src/computation_graph.cc b/lib/pcg/src/computation_graph.cc index d8a57311bf..18fded6d3e 100644 --- a/lib/pcg/src/computation_graph.cc +++ b/lib/pcg/src/computation_graph.cc @@ -2,14 +2,16 @@ namespace FlexFlow { -std::vector traverse_comp_graph(ComputationGraph const & comp_graph) { +std::vector + traverse_comp_graph_forward(ComputationGraph const &comp_graph) { std::vector layers = get_topological_ordering(comp_graph.value()); return transform(layers, [&](Node const &e) -> operator_guid_t { return operator_guid_t{e}; }); } -std::vector traverse_comp_graph_backwards(ComputationGraph const & comp_graph) { +std::vector + traverse_comp_graph_backward(ComputationGraph const &comp_graph) { std::vector layers = reversed>(get_topological_ordering(comp_graph.value())); return transform(layers, [&](Node const &e) -> operator_guid_t { @@ -17,50 +19,44 @@ std::vector traverse_comp_graph_backwards(ComputationGraph cons }); } -bool src_edge_comparator(MultiDiOutput x, MultiDiOutput y) { - return x.src_idx < y.src_idx; -} - std::vector sort_edge_set(std::unordered_set edges) { - std::unordered_set outputs = - transform(edges, [&](MultiDiEdge const &e) -> MultiDiOutput { - return MultiDiOutput(e); - }); - std::vector sorted_outputs(outputs.begin(), outputs.end()); - sort(sorted_outputs.begin(), 
sorted_outputs.end(), src_edge_comparator); - return transform(sorted_outputs, - [&](MultiDiOutput const &e) -> tensor_guid_t { - return tensor_guid_t{e}; - }); + return transform( + sorted_by(edges, compare_by([](MultiDiEdge const &e) { + return e.src_idx; + })), + [&](MultiDiEdge const &e) -> tensor_guid_t { return tensor_guid_t{e}; }); } -std::vector get_outgoing_tensors(ComputationGraph const & comp_graph, - operator_guid_t n) { +std::vector + get_outgoing_tensors(ComputationGraph const &comp_graph, + operator_guid_t n) { return sort_edge_set(get_outgoing_edges(comp_graph.value(), n.value())); } -std::vector get_incoming_tensors(ComputationGraph const & comp_graph, operator_guid_t n) { +std::vector + get_incoming_tensors(ComputationGraph const &comp_graph, + operator_guid_t n) { return sort_edge_set(get_incoming_edges(comp_graph.value(), n.value())); } -operator_guid_t add_node(ComputationGraph & comp_graph, Layer const &layer) { +operator_guid_t create_node(ComputationGraph &comp_graph, Layer const &layer) { Node added_node = comp_graph.value().add_node(layer); return operator_guid_t{added_node}; } -tensor_guid_t create_outgoing_edge_with_label(ComputationGraph & comp_graph, - operator_guid_t node, - int idx, - Tensor tensor) { +tensor_guid_t create_outgoing_edge(ComputationGraph &comp_graph, + operator_guid_t node, + int idx, + Tensor tensor) { MultiDiOutput edge = {node.value(), NodePort{idx}}; comp_graph.value().add_output(edge, tensor); return tensor_guid_t{edge}; } -void add_incoming_edges(ComputationGraph & comp_graph, -std::vector const &incoming_edges, - operator_guid_t node) { +void connect_incoming_edges(ComputationGraph &comp_graph, + std::vector const &incoming_edges, + operator_guid_t node) { size_t incoming_edge_dst_port = 0; for (tensor_guid_t input : incoming_edges) { MultiDiOutput input_view = input.value(); @@ -72,8 +68,9 @@ std::vector const &incoming_edges, } } -CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const & comp_graph, 
operator_guid_t const &n) { +CompGraphOperatorAttrs get_layer_attrs(ComputationGraph const &comp_graph, + operator_guid_t const &n) { return comp_graph.at(n).attrs; } -} \ No newline at end of file +} // namespace FlexFlow diff --git a/lib/pcg/src/computation_graph_builder.cc b/lib/pcg/src/computation_graph_builder.cc index 78e49f0695..f237232a76 100644 --- a/lib/pcg/src/computation_graph_builder.cc +++ b/lib/pcg/src/computation_graph_builder.cc @@ -12,13 +12,12 @@ tensor_guid_t ComputationGraphBuilder::add_layer( std::vector>> const &weight_shapes, TensorShape const &output_shape) { - operator_guid_t node = add_node(computation_graph, layer); - add_incoming_edges(computation_graph, inputs, node); - return create_outgoing_edge_with_label( - computation_graph, - node, - 0, - construct_tensor_from_output_shape(output_shape)); + operator_guid_t node = create_node(computation_graph, layer); + connect_incoming_edges(computation_graph, inputs, node); + return create_outgoing_edge(computation_graph, + node, + 0, + construct_tensor_from_output_shape(output_shape)); } std::vector ComputationGraphBuilder::add_layer( @@ -27,11 +26,11 @@ std::vector ComputationGraphBuilder::add_layer( std::vector>> const &weight_shapes, std::vector const &output_shapes) { - operator_guid_t node = add_node(computation_graph, layer); - add_incoming_edges(computation_graph, inputs, node); + operator_guid_t node = create_node(computation_graph, layer); + connect_incoming_edges(computation_graph, inputs, node); std::vector output_tensor_guids; for (int i = 0; i < output_shapes.size(); ++i) { - output_tensor_guids.push_back(create_outgoing_edge_with_label( + output_tensor_guids.push_back(create_outgoing_edge( computation_graph, node, i, From 583b2d30ca780c9ac2e6fbcc391869f33b2ab2b9 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 22 May 2024 13:02:05 -0700 Subject: [PATCH 13/24] Format --- lib/pcg/src/computation_graph.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lib/pcg/src/computation_graph.cc b/lib/pcg/src/computation_graph.cc index 1508ad2d41..18fded6d3e 100644 --- a/lib/pcg/src/computation_graph.cc +++ b/lib/pcg/src/computation_graph.cc @@ -29,7 +29,7 @@ std::vector } std::vector - get_outgoing_tensors(ComputationGraph const &comp_graph, + get_outgoing_tensors(ComputationGraph const &comp_graph, operator_guid_t n) { return sort_edge_set(get_outgoing_edges(comp_graph.value(), n.value())); } From 269557e1fb43a1a5c66dacdbdcc771eef9dd04b4 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Mon, 27 May 2024 22:08:34 -0700 Subject: [PATCH 14/24] Fixes --- .../include/op_task_invocation.h | 9 --------- lib/local-execution/include/op_task_signature.h | 5 +++-- lib/local-execution/src/op_task_invocation.cc | 17 ++++++++++++----- lib/local-execution/src/ops/layer_norm.cc | 13 ++++--------- 4 files changed, 19 insertions(+), 25 deletions(-) diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h index 03cd19ed8e..4c753ec43c 100644 --- a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -8,7 +8,6 @@ #include "op_tensor_spec.h" #include "profiling.h" #include "runtime_arg_ref.h" -#include "serialization.h" #include "tasks.h" #include "utils/bidict.h" #include "utils/stack_map.h" @@ -59,14 +58,6 @@ struct OpTaskBinding { this->insert_arg_spec(name, OpArgRefSpec::create(ref)); } - void bind_args_from_fwd(OpTaskBinding const &fwd) { - this->arg_bindings = fwd.get_arg_bindings(); - } - - void bind_tensors_from_fwd(OpTaskBinding const &fwd) { - this->tensor_bindings = fwd.get_tensor_bindings(); - } - std::unordered_map, OpTensorSpec> const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; diff --git a/lib/local-execution/include/op_task_signature.h b/lib/local-execution/include/op_task_signature.h index c4553df8a1..191c83d287 100644 --- a/lib/local-execution/include/op_task_signature.h +++ 
b/lib/local-execution/include/op_task_signature.h @@ -41,6 +41,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION( OpTensorSlotSpec, name, slot_type, tensor_role, is_grad, slot_option); struct OpTaskSignature { + OpTaskSignature() = delete; explicit OpTaskSignature(OpTaskType); OpTaskType get_task_type() const { @@ -81,9 +82,9 @@ struct OpTaskSignature { this->task_arg_types.insert({name, init_type_index()}); } - std::unordered_set get_tensor_slots(); + std::unordered_set get_tensor_slots() const; void set_arg_types(std::unordered_map const &); - std::unordered_map get_arg_types(); + std::unordered_map get_arg_types() const; OpTaskType type; std::optional return_value; diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 94504840c0..31fc0b2da2 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -37,8 +37,8 @@ std::unordered_map const & OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { OpTaskBinding bwd; - bwd.bind_args_from_fwd(fwd); - bwd.bind_tensors_from_fwd(fwd); + bwd.arg_bindings = fwd.get_arg_bindings(); + bwd.tensor_bindings = fwd.get_tensor_bindings(); for (auto const &[key, spec] : fwd.get_tensor_bindings()) { OpSlotOptions slot_option = spec.slot_option; if (slot_option != OpSlotOptions::UNTRAINABLE || @@ -56,7 +56,8 @@ bool is_op_tensor_spec_invalid(OpTensorSlotSpec tensor_slot_spec, tensor_spec.slot_option != tensor_slot_spec.slot_option; } -bool is_tensor_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { +bool is_tensor_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { std::pair tensor_key = @@ -77,7 +78,8 @@ bool is_arg_type_invalid(std::type_index expected_arg_type, return arg_spec_type != expected_arg_type; } -bool 
is_arg_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { +bool is_arg_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { auto sig_arg_types = sig.get_arg_types(); for (auto arg_binding : inv.binding.get_arg_bindings()) { std::type_index arg_type = sig_arg_types.at(arg_binding.first); @@ -89,9 +91,14 @@ bool is_arg_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { return true; } -bool is_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv) { +bool is_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv) { return is_tensor_invocation_valid(sig, inv) && is_arg_invocation_valid(sig, inv); } +bool are_sigs_eq(OpTaskSignature const &sig1, OpTaskSignature const &sig2) { + return sig1 == sig2; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index 83d04b893f..fb97f946eb 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -29,13 +29,9 @@ using namespace FlexFlow::Kernels::LayerNorm; enum Slots { PROFILING, INPUT, - INPUT_GRAD, OUTPUT, - OUTPUT_GRAD, GAMMA, - GAMMA_GRAD, BETA, - BETA_GRAD, PER_DEVICE_STATE, ATTRS, HANDLE @@ -95,10 +91,10 @@ static std::optional auto input = acc.get_tensor(INPUT); auto gamma = acc.get_tensor(GAMMA); - auto input_grad = acc.get_tensor(INPUT_GRAD); - auto gamma_grad = acc.get_tensor(GAMMA_GRAD); - auto beta_grad = acc.get_tensor(BETA_GRAD); - auto output_grad = acc.get_tensor(OUTPUT_GRAD); + auto input_grad = acc.get_tensor_grad(INPUT); + auto gamma_grad = acc.get_tensor_grad(GAMMA); + auto beta_grad = acc.get_tensor_grad(BETA); + auto output_grad = acc.get_tensor_grad(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); auto &state = acc.get_argument(PER_DEVICE_STATE); @@ -170,7 +166,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, fwd_binding.bind_arg(PROFILING, settings); 
fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); - // TODO how to handle gamma and beta, where are they from fwd_binding.bind(GAMMA, input.shape); fwd_binding.bind(BETA, input.shape); SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); From 269770a43214ecddb796210601a3bd4e1b35271e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 29 May 2024 22:24:35 -0700 Subject: [PATCH 15/24] Fix conflicts, some renaming --- lib/local-execution/CMakeLists.txt | 1 + lib/local-execution/include/arg_ref.h | 4 ++-- lib/local-execution/include/concrete_arg.h | 2 +- lib/local-execution/include/local_allocator.h | 5 ----- lib/local-execution/include/op_task_invocation.h | 3 ++- lib/local-execution/include/op_task_signature.h | 6 +++--- lib/local-execution/include/profiling.h | 4 ++++ lib/local-execution/src/op_task_invocation.cc | 4 ---- lib/local-execution/src/tracked_allocator.cc | 15 --------------- lib/utils/include/utils/type_index.h | 4 ++-- 10 files changed, 15 insertions(+), 33 deletions(-) diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt index ee1d8fecdc..6b432fad75 100644 --- a/lib/local-execution/CMakeLists.txt +++ b/lib/local-execution/CMakeLists.txt @@ -12,4 +12,5 @@ ff_add_library( utils kernels pcg + spdlog ) \ No newline at end of file diff --git a/lib/local-execution/include/arg_ref.h b/lib/local-execution/include/arg_ref.h index 67e8a47404..b0e2b57b05 100644 --- a/lib/local-execution/include/arg_ref.h +++ b/lib/local-execution/include/arg_ref.h @@ -42,13 +42,13 @@ struct ArgRefSpec { static ArgRefSpec create(ArgRef const &r) { static_assert(is_serializable::value, "Type must be serializeable"); - return ArgRefSpec(init_type_index(), r.ref_type); + return ArgRefSpec(get_type_index_for_type(), r.ref_type); } template static ArgRefSpec create_device_specific(ArgRef const &r, size_t device_idx) { - return ArgRefSpec(init_type_index(), r.ref_type, device_idx); + return ArgRefSpec(get_type_index_for_type(), 
r.ref_type, device_idx); } private: diff --git a/lib/local-execution/include/concrete_arg.h b/lib/local-execution/include/concrete_arg.h index 522d21485e..072500f47e 100644 --- a/lib/local-execution/include/concrete_arg.h +++ b/lib/local-execution/include/concrete_arg.h @@ -31,7 +31,7 @@ struct ConcreteArgSpec { static ConcreteArgSpec create(T const &t) { static_assert(is_serializable::value, "Type must be serializable"); - std::type_index type_idx = init_type_index(); + std::type_index type_idx = get_type_index_for_type(); std::shared_ptr ptr = std::static_pointer_cast(std::make_shared(t)); diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local_allocator.h index 85e54d09f5..b47220eb8c 100644 --- a/lib/local-execution/include/local_allocator.h +++ b/lib/local-execution/include/local_allocator.h @@ -1,10 +1,5 @@ -<<<<<<< op-refactor #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ALLOCATOR_H -======= -#ifndef _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H -#define _FLEXFLOW_RUNTIME_SRC_LOCAL_ALLOCATOR_H ->>>>>>> repo-refactor #include "kernels/allocation.h" #include diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/op_task_invocation.h index 4c753ec43c..1bf94a1b0d 100644 --- a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/op_task_invocation.h @@ -89,7 +89,8 @@ FF_VISITABLE_STRUCT(OpTaskInvocation, task_id, binding); OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd); OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); -bool is_invocation_valid(OpTaskSignature sig, OpTaskInvocation inv); +bool is_invocation_valid(OpTaskSignature const &sig, + OpTaskInvocation const &inv); } // namespace FlexFlow diff --git a/lib/local-execution/include/op_task_signature.h b/lib/local-execution/include/op_task_signature.h index 191c83d287..840c321627 100644 --- a/lib/local-execution/include/op_task_signature.h 
+++ b/lib/local-execution/include/op_task_signature.h @@ -67,19 +67,19 @@ struct OpTaskSignature { template void add_arg_slot(slot_id name) { static_assert(is_serializable::value, "Type must be serializable"); - this->task_arg_types.insert({name, init_type_index()}); + this->task_arg_types.insert({name, get_type_index_for_type()}); } template void add_return_value() { - this->return_value = init_type_index(); + this->return_value = get_type_index_for_type(); } // adds arg_slot without checking is_serializable, used for arguments that are // deviceSpecific template void add_unchecked_arg_slot(slot_id name) { - this->task_arg_types.insert({name, init_type_index()}); + this->task_arg_types.insert({name, get_type_index_for_type()}); } std::unordered_set get_tensor_slots() const; diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index c4ac1b7d02..f3c0e36cc1 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_PROFILING_H #include "kernels/profiling.h" +#include "spdlog/spdlog.h" namespace FlexFlow { @@ -12,6 +13,9 @@ std::optional profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); + if (elapsed.has_value()) { + spdlog::debug(elapsed.value()); + } return elapsed; } diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 31fc0b2da2..5683cb12ec 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -97,8 +97,4 @@ bool is_invocation_valid(OpTaskSignature const &sig, is_arg_invocation_valid(sig, inv); } -bool are_sigs_eq(OpTaskSignature const &sig1, OpTaskSignature const &sig2) { - return sig1 == sig2; -} - } // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc 
b/lib/local-execution/src/tracked_allocator.cc index 6fc2412836..6d06714252 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -3,16 +3,10 @@ namespace FlexFlow { -<<<<<<< op-refactor -void *TrackedAllocator::allocate(size_t requested_memory_size) { - void *ptr; - checkCUDA(cudaMalloc(&ptr, requested_memory_size)); -======= TrackedAllocator::TrackedAllocator(Allocator a) : allocator(a) {} void *TrackedAllocator::allocate(size_t requested_memory_size) { void *ptr = this->allocator.allocate(requested_memory_size); ->>>>>>> repo-refactor this->current_mem_usage += requested_memory_size; return ptr; } @@ -20,11 +14,7 @@ void *TrackedAllocator::allocate(size_t requested_memory_size) { void TrackedAllocator::deallocate(void *ptr) { size_t psize; checkCUDA(cudaGetSymbolSize(&psize, ptr)); -<<<<<<< op-refactor - checkCUDA(cudaFree(ptr)); -======= this->allocator.deallocate(ptr); ->>>>>>> repo-refactor this->current_mem_usage -= psize; } @@ -32,13 +22,8 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } -<<<<<<< op-refactor -Allocator get_tracked_memory_allocator() { - return Allocator::create(); -======= Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { return Allocator::create(base_allocator); ->>>>>>> repo-refactor } } // namespace FlexFlow diff --git a/lib/utils/include/utils/type_index.h b/lib/utils/include/utils/type_index.h index 134589e0aa..77a377a48d 100644 --- a/lib/utils/include/utils/type_index.h +++ b/lib/utils/include/utils/type_index.h @@ -8,13 +8,13 @@ namespace FlexFlow { template -std::type_index init_type_index() { +std::type_index get_type_index_for_type() { return std::type_index(typeid(T)); } template bool matches(std::type_index idx) { - return idx == init_type_index(); + return idx == get_type_index_for_type(); } } // namespace FlexFlow From 2fbf2911e9e3bb51e9138231c770594384d86aa3 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar 
Date: Thu, 30 May 2024 15:49:22 -0700 Subject: [PATCH 16/24] Fix gather kernels --- lib/kernels/include/kernels/gather_kernels.h | 22 ++- lib/kernels/src/cuda/ops/gather_kernels.cu | 177 +++++++++---------- lib/local-execution/include/profiling.h | 2 +- 3 files changed, 98 insertions(+), 103 deletions(-) diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index c74f9c0bb6..305ccc8e26 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -2,36 +2,34 @@ #define _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H #include "accessor.h" -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { struct GatherPerDeviceState { + PerDeviceFFHandle handle; int legion_dim; - req index_data_type; }; + FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, - legion_dim, - index_data_type); + handle, + legion_dim); namespace Kernels { namespace Gather { + void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size); + GenericTensorAccessorW const &output); + void backward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size); + GenericTensorAccessorW const &input_grad); + } // namespace Gather } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 37d0112eab..829c952a49 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -25,25 +25,25 @@ template __global__ void gather_forward(float const *input, IndexType const *index, float 
*output, - size_t output_size, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] - // output tensor index: [outer_index, index_2, left_over] + // output tensor index: [outter_index, index_2, left_over] // input tensor shape: [*, input_dim_size, stride] // input tensor stride: [input_dim_size * stride, stride, 1] // the index of the corresponding input tensor should be: - // [outer_index, index[0], left_over] - // Therefore, input_index = outer_index * (stride * input_dim_size) + // [outter_index, index[0], left_over] + // Therefore, input_index = outter_index * (stride * input_dim_size) // + index[0] * stride + left_over; - size_t outer_index = o / (stride * output_dim_size); + coord_t outter_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size - size_t left_over = o % stride; - size_t input_idx = - outer_index * (stride * input_dim_size) + index[o] * stride + left_over; + coord_t left_over = o % stride; + coord_t input_idx = outter_index * (stride * input_dim_size) + + index[o] * stride + left_over; output[o] = input[input_idx]; } } @@ -52,24 +52,24 @@ template __global__ void gather_backward(float const *output_grad, IndexType const *index, float *input_grad, - size_t output_size, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] - // output tensor index: [outer_index, index_2, left_over] + // output tensor index: [outter_index, index_2, left_over] // input tensor shape: [*, input_dim_size, stride] // input 
tensor stride: [input_dim_size * stride, stride, 1] // the index of the corresponding input tensor should be: - // [outer_index, index[0], left_over] - // Therefore, input_index = outer_index * (stride * input_dim_size) + // [outter_index, index[0], left_over] + // Therefore, input_index = outter_index * (stride * input_dim_size) // + index[0] * stride + left_over; - size_t outer_index = o / (stride * output_dim_size); + coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size - size_t left_over = o % stride; - size_t input_idx = + coord_t left_over = o % stride; + coord_t input_idx = outer_index * (stride * input_dim_size) + index[o] * stride + left_over; atomicAdd(&input_grad[input_idx], output_grad[o]); @@ -78,100 +78,97 @@ __global__ void gather_backward(float const *output_grad, template struct ForwardKernel { - void operator()(cudaStream_t stream, - GatherPerDeviceState const &m, + void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - /*size_t stride = 1; - for (int i = 0; i < m->legion_dim; i++) { - stride *= (output.domain.hi()[i] - output.domain.lo()[i] + 1); - } - size_t dim_size = - output.domain.hi()[m->legion_dim] - output.domain.lo()[m->legion_dim] + - 1; -*/ - gather_forward> - <<>>(input.get(), - index.get(), - output.get(), - output.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { + gather_forward<<>>( + input.get_float_ptr(), + index.get(), + output.get_float_ptr(), + output_size, + stride, + input_dim_size, + output_dim_size); } }; -void forward_kernel(cudaStream_t stream, +template +struct BackwardKernel { + void operator()(ffStream_t stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + 
GenericTensorAccessorW const &input_grad, + coord_t output_size, + coord_t stride, + coord_t input_dim_size, + coord_t output_dim_size) { + gather_backward<<>>( + output_grad.get_float_ptr(), + index.get(), + input_grad.get_float_ptr(), + output_size, + stride, + input_dim_size, + output_dim_size); + } +}; + +void forward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - DataTypeDispatch1{}(m.index_data_type, + GenericTensorAccessorW const &output) { + checkCUDA(get_legion_stream(&stream)); + coord_t stride = 1; + for (int i = 0; i < m.legion_dim; i++) { + stride *= output.shape[legion_dim_t(i)] + 1; + } + + coord_t output_dim_size = output.shape[legion_dim_t(m.legion_dim)] + 1; + coord_t input_dim_size = input.shape[legion_dim_t(m.legion_dim)] + 1; + + assert(index.data_type == DataType::INT32 || + index.data_type == DataType::INT64); + + DataTypeDispatch1{}(index.data_type, stream, - m, input, index, output, + output.shape.get_volume(), stride, input_dim_size, output_dim_size); } -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - GatherPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - /*size_t stride = 1; - for (int i = 0; i < m->legion_dim; i++) { - stride *= (output_grad.domain.hi()[i] - output_grad.domain.lo()[i] + 1); - } - size_t dim_size = output_grad.domain.hi()[m->legion_dim] - - output_grad.domain.lo()[m->legion_dim] + 1; - */ - gather_backward> - <<>>(output_grad.get(), - index.get(), - input_grad.get(), - output_grad.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); - } -}; - -void backward_kernel(cudaStream_t stream, +void 
backward_kernel(ffStream_t stream, GatherPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad, - size_t stride, - size_t input_dim_size, - size_t output_dim_size) { - DataTypeDispatch1{}(m.index_data_type, + GenericTensorAccessorW const &input_grad) { + checkCUDA(get_legion_stream(&stream)); + coord_t stride = 1; + for (int i = 0; i < m.legion_dim; i++) { + stride *= output_grad.shape[legion_dim_t(i)] + 1; + } + + coord_t output_dim_size = output_grad.shape[legion_dim_t(m.legion_dim)] + 1; + coord_t input_dim_size = input_grad.shape[legion_dim_t(m.legion_dim)] + 1; + + assert(index.data_type == DataType::INT32 || + index.data_type == DataType::INT64); + + DataTypeDispatch1{}(index.data_type, stream, - m, output_grad, index, input_grad, + output_grad.shape.get_volume(), stride, input_dim_size, output_dim_size); diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index f3c0e36cc1..6a3557e5b9 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -14,7 +14,7 @@ std::optional std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); if (elapsed.has_value()) { - spdlog::debug(elapsed.value()); + spdlog::debug("{} kernel execution time: {}", s, elapsed.value()); } return elapsed; } From a2a7e0a6529a95bd38d21584368901d272fc7e86 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 30 May 2024 16:20:34 -0700 Subject: [PATCH 17/24] Finish gather operator --- lib/kernels/src/cuda/ops/gather_kernels.cu | 22 ++ lib/local-execution/include/profiling.h | 2 +- lib/local-execution/src/ops/gather.cc | 225 +++++++++++ lib/local-execution/src/ops/gather.h | 30 ++ lib/op-attrs/include/op-attrs/ops/gather.h | 4 +- lib/runtime/src/ops/gather.cc | 416 --------------------- lib/runtime/src/ops/gather.h | 78 ---- 7 files changed, 280 insertions(+), 497 deletions(-) create mode 100644 
lib/local-execution/src/ops/gather.cc create mode 100644 lib/local-execution/src/ops/gather.h delete mode 100644 lib/runtime/src/ops/gather.cc delete mode 100644 lib/runtime/src/ops/gather.h diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 829c952a49..dad14d89d7 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -124,6 +124,18 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); + + // Reference code for what's below -- not sure if I got the domain/array shape + // stuff right coord_t stride = 1; for (int i = 0; i < m->legion_dim; i++) { + // stride *= (output.domain.hi()[i] - output.domain.lo()[i] + 1); + // } + // coord_t output_dim_size = + // output.domain.hi()[m->legion_dim] - output.domain.lo()[m->legion_dim] + + // 1; + // coord_t input_dim_size = + // input.domain.hi()[m->legion_dim] - input.domain.lo()[m->legion_dim] + + // 1; + coord_t stride = 1; for (int i = 0; i < m.legion_dim; i++) { stride *= output.shape[legion_dim_t(i)] + 1; @@ -152,6 +164,16 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad) { checkCUDA(get_legion_stream(&stream)); + + // Reference code for what's below -- not sure if I got the domain/array shape + // stuff right coord_t stride = 1; for (int i = 0; i < m->legion_dim; i++) { + // stride *= (output_grad.domain.hi()[i] - output_grad.domain.lo()[i] + 1); + // } + // coord_t output_dim_size = output_grad.domain.hi()[m->legion_dim] - + // output_grad.domain.lo()[m->legion_dim] + 1; + // coord_t input_dim_size = input_grad.domain.hi()[m->legion_dim] - + // input_grad.domain.lo()[m->legion_dim] + 1; + coord_t stride = 1; for (int i = 0; i < m.legion_dim; i++) { stride *= output_grad.shape[legion_dim_t(i)] + 1; diff --git 
a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index 6a3557e5b9..24753ba203 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -14,7 +14,7 @@ std::optional std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); if (elapsed.has_value()) { - spdlog::debug("{} kernel execution time: {}", s, elapsed.value()); + spdlog::debug("{}", s, elapsed.value()); } return elapsed; } diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc new file mode 100644 index 0000000000..5f3acff2f2 --- /dev/null +++ b/lib/local-execution/src/ops/gather.cc @@ -0,0 +1,225 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "gather.h" +#include "kernels/gather_kernels.h" +#include "op-attrs/get_output_shapes.h" +#include + +namespace FlexFlow { + +using namespace FlexFlow::Kernels::Gather; + +enum Slots { INPUT, OUTPUT, INDEX, ATTRS, HANDLE, PROFILING, PER_DEVICE_STATE }; + +OpTaskInvocation init(GatherAttrs const &attrs) { + OpTaskBinding binding; + + binding.bind(INPUT, input_tensor(0)); + binding.bind(INDEX, input_tensor(1)); + binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); + + return {GATHER_INIT_TASK_ID, binding}; +} + +OpTaskInvocation forward(GatherAttrs const &attrs) { + OpTaskBinding binding; + + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); + + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INDEX, weight_tensor(0)); + + return {GATHER_FWD_TASK_ID, binding}; +} + +OpTaskInvocation backward(GatherAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); + + return {GATHER_BWD_TASK_ID, binding}; +} + +static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto index = acc.get_tensor(INDEX); + auto output = acc.get_tensor(OUTPUT); + + PerDeviceFFHandle handle = acc.get_argument(HANDLE); + auto const &attrs = acc.get_argument(ATTRS); + int legion_dim = attrs.legion_dim; + + // Reference code for what's below -- not sure if I got the domain/array shape stuff right + // assert(input.domain.get_dim() == index.domain.get_dim()); + // assert(output.domain.get_dim() == index.domain.get_dim()); + // for (int i = 0; i < input.domain.get_dim(); i++) { + // assert(index.domain.hi()[i] == output.domain.hi()[i]); + // assert(index.domain.lo()[i] == output.domain.lo()[i]); + // if (i != m->legion_dim) { + // assert(input.domain.hi()[i] == index.domain.hi()[i]); + // 
assert(input.domain.lo()[i] == index.domain.lo()[i]); + // } + // } + + assert (input.shape.get_dim() == index.shape.get_dim()); + assert (output.shape.get_dim() == index.shape.get_dim()); + + for (int i = 0; i < input.shape.get_dim(); i++) { + assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); + if (i != legion_dim) { + assert(input.shape[legion_dim_t(i)] == index.shape[legion_dim_t(i)]); + } + } + + return DeviceSpecific({handle, legion_dim}); +} + +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + + auto input = acc.get_tensor(INPUT); + auto index = acc.get_tensor(INDEX); + auto output = acc.get_tensor(OUTPUT); + + return profile(forward_kernel, + profiling, + "[Gather] forward_time = %.2lfms\n", + per_device_state, + input, + index, + output); +} + +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto index = acc.get_tensor(INDEX); + auto input_grad = acc.get_tensor_grad(INPUT); + + return profile(forward_kernel, + profiling, + "[Gather] forward_time = %.2lfms\n", + per_device_state, + output_grad, + index, + input_grad); +} + +CostMetrics measure_operator_cost(SimEnvFactory const &sim, + GatherAttrs const &attrs, + InputParallelTensorDesc const &input_shape, + InputParallelTensorDesc const &index_shape, + ProfilingSettings const &settings, + MachineView const &mv) { + + auto env = sim.new_environment(); + + std::vector output_shape = + get_output_shapes(attrs, input_shape.shape, index_shape.shape); + + SimTaskBinding fwd_binding; + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(ATTRS, attrs); + + fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(OUTPUT, 
output_shape); + fwd_binding.bind(INDEX, index_shape); + + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); + + auto fwd_accessor = env.get_fwd_accessor(GATHER_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(GATHER_BWD_TASK_ID, bwd_binding); + + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); +} + +template <> +OpTaskSignature init_signature() { + OpTaskSignature init(OpTaskType::INIT); + + init.add_input_slot(INPUT); + init.add_input_slot(INDEX); + init.add_output_slot(OUTPUT); + + init.add_arg_slot(ATTRS); + init.add_unchecked_arg_slot(HANDLE); + + init.add_return_value(); + + return init; +} + +template <> +void register_task() { + register_task(GATHER_INIT_TASK_ID, + "Gather Init", + init_signature(), + init_task_impl); +} + +template <> +OpTaskSignature fwd_signature() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(ATTRS); + + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + fwd.add_weight_slot(INDEX); + + return fwd; +} + +template <> +void register_task() { + register_task(GATHER_FWD_TASK_ID, + "Gather Fwd", + fwd_signature(), + forward_task_impl); +} + +template <> +OpTaskSignature bwd_signature() { + OpTaskSignature bwd = + infer_bwd_signature(fwd_signature()); + + return bwd; +} + +template <> +void register_task() { + register_task(GATHER_BWD_TASK_ID, + "Gather Bwd", + bwd_signature(), + backward_task_impl); +} + +}; // namespace FlexFlow diff --git a/lib/local-execution/src/ops/gather.h b/lib/local-execution/src/ops/gather.h new file mode 100644 index 0000000000..e83f768cb7 --- /dev/null +++ b/lib/local-execution/src/ops/gather.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_GATHER_H +#define _FLEXFLOW_GATHER_H + +#include "op-attrs/ops/gather.h" +#include 
"op_task_invocation.h" +#include "sim_environment.h" + +namespace FlexFlow { + +template <> +void register_task(); +template <> +void register_task(); +template <> +void register_task(); + +OpTaskInvocation init(GatherAttrs const &); +OpTaskInvocation forward(GatherAttrs const &); +OpTaskInvocation backward(GatherAttrs const &); + +CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, + GatherAttrs const &attrs, + InputParallelTensorDesc const &input, + InputParallelTensorDesc const &index, + ProfilingSettings const &settings, + MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h index ca2406ef75..70dd65712a 100644 --- a/lib/op-attrs/include/op-attrs/ops/gather.h +++ b/lib/op-attrs/include/op-attrs/ops/gather.h @@ -9,9 +9,9 @@ namespace FlexFlow { struct GatherAttrs { - ff_dim_t dim; + req legion_dim; }; -FF_VISITABLE_STRUCT(GatherAttrs, dim); +FF_VISITABLE_STRUCT(GatherAttrs, legion_dim); CHECK_VALID_OP_ATTR(GatherAttrs); } // namespace FlexFlow diff --git a/lib/runtime/src/ops/gather.cc b/lib/runtime/src/ops/gather.cc deleted file mode 100644 index 9ef53ffc6a..0000000000 --- a/lib/runtime/src/ops/gather.cc +++ /dev/null @@ -1,416 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gather.h" -#include "embedding.h" -#include "kernels/gather_kernels.h" -#include "legion/legion_utilities.h" - -namespace FlexFlow { - -// declare Legion names -using Legion::ArgumentMap; -using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; -using Legion::Runtime; -using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; -using PCG::Node; - -using namespace FlexFlow::Kernels::Gather; - -GatherParams Gather::get_params() const { - GatherParams params; - params.legion_dim = this->legion_dim; - params.layer_guid = this->layer_guid; - return params; -} - -Tensor FFModel::gather(const Tensor input, - const Tensor index, - int dim, - char const *name) { - Layer *gather = new Layer(this, - OP_GATHER, - DT_FLOAT, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*output*/, - input, - index); - assert(index->data_type == DT_INT32 || index->data_type == DT_INT64); - assert(input->num_dims == index->num_dims); - int legion_dim = input->num_dims - 1 - dim; - // https://pytorch.org/docs/stable/generated/torch.gather.html - // Currently we assume index.size(d) == input.size(d) for all - // dimensions d != dim, which is a stronger constraint that PyTorch's - for (int i = 0; i < input->num_dims; i++) { - if (i != legion_dim) { - assert(input->dims[i] == index->dims[i]); - } - } - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < index->num_dims; i++) { - dims[i] = index->dims[i]; - } - gather->outputs[0] = create_tensor_legion_ordering( - index->num_dims, dims, input->data_type, gather, 0, true /*create_grad*/); - gather->add_int_property("legion_dim", legion_dim); - layers.push_back(gather); - return gather->outputs[0]; -} - -Op *Gather::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - 
layer->get_int_property("legion_dim", value); - int legion_dim = value; - return new Gather( - model, layer->layer_guid, inputs[0], inputs[1], legion_dim, layer->name); -} - -Gather::Gather(FFModel &model, - GatherParams const ¶ms, - std::pair const &inputs, - char const *name) - : Gather(model, - params.layer_guid, - inputs.first, - inputs.second, - params.legion_dim, - name) {} - -Gather::Gather(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor input, - const ParallelTensor index, - int _legion_dim, - char const *name) - : Op(model, - OP_GATHER, - input->data_type, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - index), - legion_dim(_legion_dim) { - layer_guid = _layer_guid; - // Assume that input and index have the same paralleldim except - // for the legion_dim-th dim, which cannot be parallelized - for (int i = 0; i < input->num_dims; i++) { - if (i != legion_dim) { - assert(input->dims[i] == index->dims[i]); - } - } - assert(index->dims[legion_dim].degree == 1); - assert(input->dims[legion_dim].degree == 1); - // output has the same parallel dims as index - ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < index->num_dims; i++) { - dims[i] = index->dims[i]; - } - outputs[0] = model.create_parallel_tensor_legion_ordering( - index->num_dims, dims, input->data_type, this); -} - -void Gather::serialize(Legion::Serializer &sez) const { - GatherParams params = get_params(); - sez.serialize(params.legion_dim); - sez.serialize(this->layer_guid.id); -} - -using PCG::Node; -/*static*/ -Node Gather::deserialize(FFModel &ff, - Legion::Deserializer &dez, - ParallelTensor inputs[], - int num_inputs) { - assert(num_inputs == 2); - int legion_dim; - dez.deserialize(legion_dim); - size_t id; - dez.deserialize(id); - LayerID layer_guid(id); - - GatherParams params; - params.legion_dim = legion_dim; - params.layer_guid = layer_guid; - return ff.get_or_create_node({inputs[0], inputs[1]}, params); -} - -Op *Gather::materialize(FFModel 
&ff, - ParallelTensor inputs[], - int num_inputs) const { - GatherParams params = get_params(); - return new Gather(ff, params, {inputs[0], inputs[1]}, this->name); -} - -void Gather::init(FFModel const &ff) { - assert(check_output_input_weight_same_parallel_is()); - parallel_is = outputs[0]->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_init(ff, argmap); - IndexLauncher launcher(GATHER_INIT_TASK_ID, - parallel_is, - TaskArgument(this, sizeof(Gather)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); - set_opmeta_from_futuremap(ff, fm); -} - -PerDeviceOpState *Gather::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - Gather const *gather = (Gather const *)task->args; - FFHandler handle = *((FFHandler const *)task->local_args); - GatherMeta *m = new GatherMeta(handle, gather); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = 
helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - assert(input.domain.get_dim() == index.domain.get_dim()); - assert(output.domain.get_dim() == index.domain.get_dim()); - for (int i = 0; i < input.domain.get_dim(); i++) { - assert(index.domain.hi()[i] == output.domain.hi()[i]); - assert(index.domain.lo()[i] == output.domain.lo()[i]); - if (i != m->legion_dim) { - assert(input.domain.hi()[i] == index.domain.hi()[i]); - assert(input.domain.lo()[i] == index.domain.lo()[i]); - } - } - return m; -} - -void Gather::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(GATHER_FWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, false), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} - -void Gather::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - GatherMeta const *m = *((GatherMeta **)task->local_args); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], 
regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, index, output); -} - -void Gather::backward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); - IndexLauncher launcher(GATHER_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - inputs[0]->region_grad)); - launcher.add_field(2, FID_DATA); - runtime->execute_index_space(ctx, launcher); -} - -void Gather::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 3); - assert(task->regions.size() == 3); - GatherMeta const *m = *((GatherMeta **)task->local_args); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR index = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - backward_kernel_wrapper(m, output_grad, index, input_grad); -} - -bool 
Gather::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_input, sub_index, sub_output; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - if (!inputs[1]->get_sub_tensor(mv, sub_index)) { - return false; - } - GatherMeta *m = new GatherMeta(sim->handler, this); - sim->free_all(); - bool out_of_memory = false; - Domain input_domain = sub_input.get_domain(); - void *input_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW input_acc( - inputs[0]->data_type, input_domain, input_ptr); - Domain index_domain = sub_index.get_domain(); - void *index_ptr = sim->allocate(sub_index.get_volume(), inputs[1]->data_type); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW index_acc( - inputs[1]->data_type, index_domain, index_ptr); - out_of_memory = out_of_memory || (input_ptr == NULL) || (index_ptr == NULL); - Domain out_domain = sub_output.get_domain(); - void *output_ptr = - sim->allocate(sub_output.get_volume(), outputs[0]->data_type); - out_of_memory = out_of_memory || (output_ptr == NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); - GenericTensorAccessorW output_acc( - outputs[0]->data_type, out_domain, output_ptr); - if (out_of_memory) { - cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME; - return true; - } - - std::function forward, backward; - forward = [&] { - forward_kernel_wrapper(m, input_acc, index_acc, output_acc); - }; - if (sim->computationMode == COMP_MODE_TRAINING) { - backward = [&] { - backward_kernel_wrapper(m, output_acc, index_acc, input_acc); - }; - } - - inner_measure_operator_cost(sim, forward, backward, 
cost_metrics); - - if (sim->computationMode == COMP_MODE_TRAINING) { - printf("[Measure Gather] name(%s) forward_time(%.4lf) " - "backward_time(%.4lf)\n", - name, - cost_metrics.forward_time, - cost_metrics.backward_time); - } else { - printf("[Measure Gather] name(%s) forward_time(%.4lf)\n", - name, - cost_metrics.forward_time); - } - delete m; - return true; -} - -}; // namespace FlexFlow - -namespace std { -size_t hash::operator()( - FlexFlow::GatherParams const ¶ms) const { - size_t key = 0; - hash_combine(key, params.legion_dim); - hash_combine(key, params.layer_guid.id); - return key; -} -}; // namespace std diff --git a/lib/runtime/src/ops/gather.h b/lib/runtime/src/ops/gather.h deleted file mode 100644 index 1ea20b71f5..0000000000 --- a/lib/runtime/src/ops/gather.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef _FLEXFLOW_OPS_GATHER_H -#define _FLEXFLOW_OPS_GATHER_H - -#include "op-attrs/ops/gather.h" -#include "op_task_invocation.h" -#include "sim_environment.h" - -namespace FlexFlow { - -template <> -void register_task(); -template <> -void register_task(); -template <> -void register_task(); - -OpTaskInvocation init(GatherAttrs const &); -OpTaskInvocation forward(GatherAttrs const &); -OpTaskInvocation backward(GatherAttrs const &); - -CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, - GatherAttrs const &attrs, - ParallelTensorShape const &input_shape, - ParallelTensorShape const &index_shape, - ProfilingSettings const &settings, - MachineView const &machine_view); - -/* class Gather : public Op { */ -/* public: */ -/* Gather(FFModel &model, */ -/* ParallelTensor const &input, */ -/* ParallelTensor const &index, */ -/* int legion_dim, */ -/* char const *name = nullptr); */ -/* void init(FFModel const &) override; */ -/* void forward(FFModel const &) override; */ -/* void backward(FFModel const &) override; */ - -/* static Op * */ -/* create_operator_from_layer(FFModel &model, */ -/* Layer const *layer, */ -/* std::vector const &inputs); - */ - 
-/* static PerDeviceOpState *init_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void forward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void backward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* bool measure_operator_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const override; */ -/* void serialize(Legion::Serializer &s) const override; */ -/* /1* static PCG::Node deserialize(FFModel &ff, *1/ */ -/* /1* Legion::Deserializer &d, *1/ */ -/* /1* ParallelTensor inputs[], *1/ */ -/* /1* int num_inputs); *1/ */ -/* Op *materialize(FFModel &ff, */ -/* ParallelTensor inputs[], */ -/* int num_inputs) const override; */ - -/* public: */ -/* int legion_dim; */ -/* }; */ - -} // namespace FlexFlow - -#endif From e0b259cdd75d250f8f44e8e3a6477d7438036776 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 30 May 2024 16:26:32 -0700 Subject: [PATCH 18/24] Format --- lib/local-execution/src/ops/gather.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index 5f3acff2f2..566fef2785 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -57,7 +57,8 @@ OpTaskInvocation backward(GatherAttrs const &attrs) { return {GATHER_BWD_TASK_ID, binding}; } -static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto index = acc.get_tensor(INDEX); auto output = acc.get_tensor(OUTPUT); @@ -66,8 +67,8 @@ static DeviceSpecific init_task_impl(TaskArgumentAccessor auto const &attrs = 
acc.get_argument(ATTRS); int legion_dim = attrs.legion_dim; - // Reference code for what's below -- not sure if I got the domain/array shape stuff right - // assert(input.domain.get_dim() == index.domain.get_dim()); + // Reference code for what's below -- not sure if I got the domain/array shape + // stuff right assert(input.domain.get_dim() == index.domain.get_dim()); // assert(output.domain.get_dim() == index.domain.get_dim()); // for (int i = 0; i < input.domain.get_dim(); i++) { // assert(index.domain.hi()[i] == output.domain.hi()[i]); @@ -78,8 +79,8 @@ static DeviceSpecific init_task_impl(TaskArgumentAccessor // } // } - assert (input.shape.get_dim() == index.shape.get_dim()); - assert (output.shape.get_dim() == index.shape.get_dim()); + assert(input.shape.get_dim() == index.shape.get_dim()); + assert(output.shape.get_dim() == index.shape.get_dim()); for (int i = 0; i < input.shape.get_dim(); i++) { assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); From 55971f26764bcbb1261bdcd5772cf7ebc5a3bcc6 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 30 May 2024 20:11:42 -0700 Subject: [PATCH 19/24] Fix substitutions --- lib/substitutions/src/operator_attributes.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/substitutions/src/operator_attributes.cc b/lib/substitutions/src/operator_attributes.cc index 8bd8688194..be5d63024e 100644 --- a/lib/substitutions/src/operator_attributes.cc +++ b/lib/substitutions/src/operator_attributes.cc @@ -129,7 +129,7 @@ std::optional get_attribute(GatherAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::AXIS: - return p.dim; + return p.legion_dim; default: return std::nullopt; } From da38f0a0901e18aefc56a28446a2ae90399e996e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Fri, 31 May 2024 18:57:59 -0700 Subject: [PATCH 20/24] Fix legion dim in gather --- lib/kernels/include/kernels/array_shape.h | 2 +- lib/kernels/include/kernels/gather_kernels.h 
| 2 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 64 ++++++------------- .../include}/legion_tensor_shape.h | 4 +- lib/local-execution/include/profiling.h | 2 +- lib/local-execution/src/ops/gather.cc | 17 +---- lib/op-attrs/include/op-attrs/ops/gather.h | 4 +- lib/substitutions/src/operator_attributes.cc | 2 +- 8 files changed, 32 insertions(+), 65 deletions(-) rename lib/{runtime/src => local-execution/include}/legion_tensor_shape.h (92%) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 36796bc504..15f14f8757 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -42,7 +42,7 @@ struct ArrayShape { ArrayShape reversed_dim_order() const; ArrayShape sub_shape(std::optional start, - std::optional end); + std::optional end) const; public: LegionTensorDims dims; diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 305ccc8e26..13bf4b898a 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -8,7 +8,7 @@ namespace FlexFlow { struct GatherPerDeviceState { PerDeviceFFHandle handle; - int legion_dim; + legion_dim_t legion_dim; }; FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index dad14d89d7..286acf7376 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -32,18 +32,18 @@ __global__ void gather_forward(float const *input, CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] - // output tensor index: [outter_index, index_2, left_over] + // output tensor index: [outer_index, index_2, left_over] // input tensor shape: [*, input_dim_size, stride] // input tensor stride: [input_dim_size * 
stride, stride, 1] // the index of the corresponding input tensor should be: - // [outter_index, index[0], left_over] - // Therefore, input_index = outter_index * (stride * input_dim_size) + // [outer_index, index[0], left_over] + // Therefore, input_index = outer_index * (stride * input_dim_size) // + index[0] * stride + left_over; - coord_t outter_index = o / (stride * output_dim_size); + coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size coord_t left_over = o % stride; - coord_t input_idx = outter_index * (stride * input_dim_size) + - index[o] * stride + left_over; + coord_t input_idx = + outer_index * (stride * input_dim_size) + index[o] * stride + left_over; output[o] = input[input_idx]; } } @@ -59,12 +59,12 @@ __global__ void gather_backward(float const *output_grad, CUDA_KERNEL_LOOP(o, output_size) { // output tensor shape: [*, output_dim_size, stride] // output tensor stride: [output_dim_size * stride, stride, 1] - // output tensor index: [outter_index, index_2, left_over] + // output tensor index: [outer_index, index_2, left_over] // input tensor shape: [*, input_dim_size, stride] // input tensor stride: [input_dim_size * stride, stride, 1] // the index of the corresponding input tensor should be: - // [outter_index, index[0], left_over] - // Therefore, input_index = outter_index * (stride * input_dim_size) + // [outer_index, index[0], left_over] + // Therefore, input_index = outer_index * (stride * input_dim_size) // + index[0] * stride + left_over; coord_t outer_index = o / (stride * output_dim_size); // coord_t index_2 = (o / stride) % dim_size @@ -125,24 +125,12 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - // Reference code for what's below -- not sure if I got the domain/array shape - // stuff right coord_t stride = 1; for (int i = 0; i < m->legion_dim; i++) { - // stride *= (output.domain.hi()[i] - output.domain.lo()[i] + 1); 
- // } - // coord_t output_dim_size = - // output.domain.hi()[m->legion_dim] - output.domain.lo()[m->legion_dim] + - // 1; - // coord_t input_dim_size = - // input.domain.hi()[m->legion_dim] - input.domain.lo()[m->legion_dim] + - // 1; - - coord_t stride = 1; - for (int i = 0; i < m.legion_dim; i++) { - stride *= output.shape[legion_dim_t(i)] + 1; - } - - coord_t output_dim_size = output.shape[legion_dim_t(m.legion_dim)] + 1; - coord_t input_dim_size = input.shape[legion_dim_t(m.legion_dim)] + 1; + coord_t stride = + output.shape + .sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1}) + .get_volume(); + coord_t output_dim_size = output.shape[m.legion_dim]; + coord_t input_dim_size = input.shape[m.legion_dim]; assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); @@ -165,22 +153,12 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &input_grad) { checkCUDA(get_legion_stream(&stream)); - // Reference code for what's below -- not sure if I got the domain/array shape - // stuff right coord_t stride = 1; for (int i = 0; i < m->legion_dim; i++) { - // stride *= (output_grad.domain.hi()[i] - output_grad.domain.lo()[i] + 1); - // } - // coord_t output_dim_size = output_grad.domain.hi()[m->legion_dim] - - // output_grad.domain.lo()[m->legion_dim] + 1; - // coord_t input_dim_size = input_grad.domain.hi()[m->legion_dim] - - // input_grad.domain.lo()[m->legion_dim] + 1; - - coord_t stride = 1; - for (int i = 0; i < m.legion_dim; i++) { - stride *= output_grad.shape[legion_dim_t(i)] + 1; - } - - coord_t output_dim_size = output_grad.shape[legion_dim_t(m.legion_dim)] + 1; - coord_t input_dim_size = input_grad.shape[legion_dim_t(m.legion_dim)] + 1; + coord_t stride = + output_grad.shape + .sub_shape(std::nullopt, legion_dim_t{m.legion_dim.value() + 1}) + .get_volume(); + coord_t output_dim_size = output_grad.shape[m.legion_dim]; + coord_t input_dim_size = input_grad.shape[m.legion_dim]; assert(index.data_type == 
DataType::INT32 || index.data_type == DataType::INT64); diff --git a/lib/runtime/src/legion_tensor_shape.h b/lib/local-execution/include/legion_tensor_shape.h similarity index 92% rename from lib/runtime/src/legion_tensor_shape.h rename to lib/local-execution/include/legion_tensor_shape.h index 1f5fab76a6..ff96ba9a15 100644 --- a/lib/runtime/src/legion_tensor_shape.h +++ b/lib/local-execution/include/legion_tensor_shape.h @@ -28,8 +28,8 @@ struct LegionTensorShape : public use_visitable_cmp, DataType data_type; }; -ff_dim_t to_ff(legion_dim_t, int num_dims); -legion_dim_t to_legion(ff_dim_t, int num_dims); +ff_dim_t to_ff(legion_dim_t, size_t num_dims); +legion_dim_t to_legion(ff_dim_t, size_t num_dims); ff_dim_t to_ff(legion_dim_t, TensorShape const &); legion_dim_t to_legion(ff_dim_t, TensorShape const &); diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/profiling.h index 24753ba203..bd50801fc4 100644 --- a/lib/local-execution/include/profiling.h +++ b/lib/local-execution/include/profiling.h @@ -14,7 +14,7 @@ std::optional std::optional elapsed = profiling_wrapper(f, profiling, std::forward(ts)...); if (elapsed.has_value()) { - spdlog::debug("{}", s, elapsed.value()); + spdlog::debug(s, elapsed.value()); } return elapsed; } diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index 566fef2785..0f53348cbe 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -15,6 +15,7 @@ #include "gather.h" #include "kernels/gather_kernels.h" +#include "legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" #include @@ -65,26 +66,14 @@ static DeviceSpecific PerDeviceFFHandle handle = acc.get_argument(HANDLE); auto const &attrs = acc.get_argument(ATTRS); - int legion_dim = attrs.legion_dim; - - // Reference code for what's below -- not sure if I got the domain/array shape - // stuff right assert(input.domain.get_dim() == index.domain.get_dim()); - // 
assert(output.domain.get_dim() == index.domain.get_dim()); - // for (int i = 0; i < input.domain.get_dim(); i++) { - // assert(index.domain.hi()[i] == output.domain.hi()[i]); - // assert(index.domain.lo()[i] == output.domain.lo()[i]); - // if (i != m->legion_dim) { - // assert(input.domain.hi()[i] == index.domain.hi()[i]); - // assert(input.domain.lo()[i] == index.domain.lo()[i]); - // } - // } + legion_dim_t legion_dim = to_legion(attrs.ff_dim, input.shape.num_dims()); assert(input.shape.get_dim() == index.shape.get_dim()); assert(output.shape.get_dim() == index.shape.get_dim()); for (int i = 0; i < input.shape.get_dim(); i++) { assert(index.shape[legion_dim_t(i)] == output.shape[legion_dim_t(i)]); - if (i != legion_dim) { + if (i != legion_dim.value()) { assert(input.shape[legion_dim_t(i)] == index.shape[legion_dim_t(i)]); } } diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h index 70dd65712a..85a732e3d6 100644 --- a/lib/op-attrs/include/op-attrs/ops/gather.h +++ b/lib/op-attrs/include/op-attrs/ops/gather.h @@ -9,9 +9,9 @@ namespace FlexFlow { struct GatherAttrs { - req legion_dim; + req ff_dim; }; -FF_VISITABLE_STRUCT(GatherAttrs, legion_dim); +FF_VISITABLE_STRUCT(GatherAttrs, ff_dim); CHECK_VALID_OP_ATTR(GatherAttrs); } // namespace FlexFlow diff --git a/lib/substitutions/src/operator_attributes.cc b/lib/substitutions/src/operator_attributes.cc index be5d63024e..2c4cdfbcd8 100644 --- a/lib/substitutions/src/operator_attributes.cc +++ b/lib/substitutions/src/operator_attributes.cc @@ -129,7 +129,7 @@ std::optional get_attribute(GatherAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::AXIS: - return p.legion_dim; + return p.ff_dim; default: return std::nullopt; } From 5f539a3a1edcf72bdb1ec58b099fd0053f0789a0 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 1 Jun 2024 16:00:38 -0700 Subject: [PATCH 21/24] Format string fixes --- lib/local-execution/src/ops/attention.cc | 
4 ++-- lib/local-execution/src/ops/batch_matmul.cc | 4 ++-- lib/local-execution/src/ops/batch_norm.cc | 4 ++-- lib/local-execution/src/ops/cast.cc | 4 ++-- lib/local-execution/src/ops/combine.cc | 4 ++-- lib/local-execution/src/ops/concat.cc | 4 ++-- lib/local-execution/src/ops/conv_2d.cc | 4 ++-- lib/local-execution/src/ops/dropout.cc | 4 ++-- lib/local-execution/src/ops/element_binary.cc | 4 ++-- lib/local-execution/src/ops/element_unary.cc | 4 ++-- lib/local-execution/src/ops/embedding.cc | 4 ++-- lib/local-execution/src/ops/flat.cc | 4 ++-- lib/local-execution/src/ops/gather.cc | 6 +++--- lib/local-execution/src/ops/layer_norm.cc | 4 ++-- lib/local-execution/src/ops/linear.cc | 4 ++-- lib/local-execution/src/ops/partition.cc | 4 ++-- lib/local-execution/src/ops/pool_2d.cc | 4 ++-- lib/local-execution/src/ops/reduce.cc | 4 ++-- lib/local-execution/src/ops/reduction.cc | 4 ++-- lib/local-execution/src/ops/replicate.cc | 4 ++-- lib/local-execution/src/ops/reshape.cc | 4 ++-- lib/local-execution/src/ops/reverse.cc | 4 ++-- lib/local-execution/src/ops/softmax.cc | 4 ++-- lib/local-execution/src/ops/split.cc | 4 ++-- lib/local-execution/src/ops/topk.cc | 4 ++-- lib/local-execution/src/ops/transpose.cc | 4 ++-- lib/op-attrs/include/op-attrs/ops/gather.h | 4 ++-- lib/substitutions/src/operator_attributes.cc | 2 +- 28 files changed, 56 insertions(+), 56 deletions(-) diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index 414b71ec70..3f11829d2f 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -168,7 +168,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[MultiHeadAttention] forward_time = %.2lfms\n", + "[MultiHeadAttention] forward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), key.get_float_ptr(), @@ -207,7 +207,7 @@ static std::optional return profile(backward_kernel, profiling, - 
"[MultiHeadAttention] backward_time = %.2lfms\n", + "[MultiHeadAttention] backward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), query_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index eccbe5a475..76bc88eae6 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -85,7 +85,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[BatchMatmul] forward_time = %.2lfms\n", + "[BatchMatmul] forward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), a_input.get_float_ptr(), @@ -138,7 +138,7 @@ static std::optional return profile(backward_kernel, profiling, - "[BatchMatmul] backward_time = %.2lfms\n", + "[BatchMatmul] backward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), output_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 5e640d70e0..97830f90fe 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -106,7 +106,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[BatchNorm] forward_time = %.2lfms\n", + "[BatchNorm] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -130,7 +130,7 @@ static std::optional return profile(backward_kernel, profiling, - "[BatchNorm] backward_time = %.2lfms\n", + "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 5647d7e7f2..7a74c8824d 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -52,7 +52,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return 
profile(forward_kernel, profiling, - "[Cast] forward_time = %.2lfms\n", + "[Cast] forward_time = {:.2lf}ms\n", input, output, input.data_type, @@ -71,7 +71,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Cast] forward_time = %.2lfms\n", + "[Cast] forward_time = {:.2lf}ms\n", input_grad, output_grad, input.data_type, diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index 0bce55722a..a39a503333 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -50,7 +50,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Combine] forward_time = %.2lfms\n", + "[Combine] forward_time = {:.2lf}ms\n", input, output); } @@ -64,7 +64,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Combine] forward_time = %.2lfms\n", + "[Combine] forward_time = {:.2lf}ms\n", input_grad, output_grad); } diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 087f08b577..3cbc232fac 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -54,7 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Concat] forward_time = %.2lfms\n", + "[Concat] forward_time = {:.2lf}ms\n", output, inputs, attrs.axis); @@ -72,7 +72,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Concat] backward_time = %.2lfms\n", + "[Concat] backward_time = {:.2lf}ms\n", output_grad, input_grads, attrs.axis); diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index a53b259fac..eef4c21a45 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -92,7 +92,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, 
profiling, - "[Conv2d] forward_time = %.2lfms\n", + "[Conv2d] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -119,7 +119,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Conv2d] backward_time = %.2lfms\n", + "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), input_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index 4935091ee5..3db1e7b8eb 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -61,7 +61,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Dropout] forward_time = %.2lfms\n", + "[Dropout] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr()); @@ -79,7 +79,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Dropout] backward_time = %.2lfms\n", + "[Dropout] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_grad.get_float_ptr()); diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc index b5588e04fd..a2e9ee2ba8 100644 --- a/lib/local-execution/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -84,7 +84,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[ElementBinary] forward_time = %.2lfms\n", + "[ElementBinary] forward_time = {:.2lf}ms\n", per_device_state, input_lhs.get_float_ptr(), input_rhs.get_float_ptr(), @@ -111,7 +111,7 @@ static std::optional return profile(backward_kernel, profiling, - "[ElementBinary] backward_time = %.2lfms\n", + "[ElementBinary] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_lhs.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc 
b/lib/local-execution/src/ops/element_unary.cc index ddec57414a..2ad5d797f5 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -75,7 +75,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[ElementUnary] forward_time = %.2lfms\n", + "[ElementUnary] forward_time = {:.2lf}ms\n", per_device_state, attrs, handle, @@ -99,7 +99,7 @@ static std::optional return profile(backward_kernel, profiling, - "[ElementUnary] backward_time = %.2lfms\n", + "[ElementUnary] backward_time = {:.2lf}ms\n", per_device_state, attrs, handle, diff --git a/lib/local-execution/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc index bac48c4b24..6ce13d88c9 100644 --- a/lib/local-execution/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -53,7 +53,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Embedding] forward_time = %.2lfms\n", + "[Embedding] forward_time = {:.2lf}ms\n", input, output, weight, @@ -76,7 +76,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Embedding] forward_time = %.2lfms\n", + "[Embedding] forward_time = {:.2lf}ms\n", input, output, weight_grad, diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 9849bd3b73..194d84aaa8 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -33,7 +33,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Flat] forward_time = %.2lfms\n", + "[Flat] forward_time = {:.2lf}ms\n", input, output.get_float_ptr()); } @@ -48,7 +48,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Flat] forward_time = %.2lfms\n", + "[Flat] forward_time = {:.2lf}ms\n", input, input_grad.get_float_ptr(), output_grad.get_float_ptr()); diff 
--git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index 0f53348cbe..091abd0ed3 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -66,7 +66,7 @@ static DeviceSpecific PerDeviceFFHandle handle = acc.get_argument(HANDLE); auto const &attrs = acc.get_argument(ATTRS); - legion_dim_t legion_dim = to_legion(attrs.ff_dim, input.shape.num_dims()); + legion_dim_t legion_dim = to_legion(attrs.dim, input.shape.num_dims()); assert(input.shape.get_dim() == index.shape.get_dim()); assert(output.shape.get_dim() == index.shape.get_dim()); @@ -92,7 +92,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Gather] forward_time = %.2lfms\n", + "[Gather] forward_time = {:.2lf}ms\n", per_device_state, input, index, @@ -111,7 +111,7 @@ static std::optional return profile(forward_kernel, profiling, - "[Gather] forward_time = %.2lfms\n", + "[Gather] forward_time = {:.2lf}ms\n", per_device_state, output_grad, index, diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index fb97f946eb..620758772c 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -78,7 +78,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[LayerNorm] forward time = %.2lfms\n", + "[LayerNorm] forward time = {:.2lf}ms\n", state, input, output, @@ -101,7 +101,7 @@ static std::optional return profile(backward_kernel, profiling, - "[LayerNorm] backward time = %.2lfms\n", + "[LayerNorm] backward time = {:.2lf}ms\n", state, output_grad, input, diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 08e8fa3f68..36533be211 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -105,7 +105,7 @@ static std::optional 
forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Linear] forward_time = %.2lfms\n", + "[Linear] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -144,7 +144,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Linear] backward_time = %.2lfms\n", + "[Linear] backward_time = {:.2lf}ms\n", per_device_state, (void *)input.get_float_ptr(), (void *)input_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/partition.cc b/lib/local-execution/src/ops/partition.cc index 1d358b52f5..4b09ad026b 100644 --- a/lib/local-execution/src/ops/partition.cc +++ b/lib/local-execution/src/ops/partition.cc @@ -73,7 +73,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Reparition/Partition] forward_time = %.2lfms\n", + "[Reparition/Partition] forward_time = {:.2lf}ms\n", per_device_state, input, output); @@ -89,7 +89,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Reparition/Partition] backward_time = %.2lfms\n", + "[Reparition/Partition] backward_time = {:.2lf}ms\n", per_device_state, output_grad, input_grad); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 576a5a8d23..989f390380 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -113,7 +113,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Pool2D] forward_time = %.2lfms\n", + "[Pool2D] forward_time = {:.2lf}ms\n", state, input.get_float_ptr(), output.get_float_ptr()); @@ -132,7 +132,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Pool2D] backward_time = %.2lfms\n", + "[Pool2D] backward_time = {:.2lf}ms\n", state, input.get_float_ptr(), input_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/reduce.cc 
b/lib/local-execution/src/ops/reduce.cc index 0ccd7be6e3..98d1a6f522 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -83,7 +83,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Reduce] forward_time = %.2lfms\n", + "[Reduce] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr()); @@ -119,7 +119,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Reduce] backward_time = %.2lfms\n", + "[Reduce] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), input_grad.get_float_ptr()); diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index 86f300df63..3fa300f64d 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -55,7 +55,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling_settings, - "[Reduction] forward_time = %.2lfms\n", + "[Reduction] forward_time = {:.2lf}ms\n", input, output, num_replicas); @@ -69,7 +69,7 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, - "[Reduction] backward_time = %.2lfms\n", + "[Reduction] backward_time = {:.2lf}ms\n", input_grad, output_grad); } diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 3322f8a1ce..a441985b78 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -54,7 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[replicate] forward_time = %.2lfms\n", + "[replicate] forward_time = {:.2lf}ms\n", input, output); } @@ -69,7 +69,7 @@ static std::optional return profile(backward_kernel, profiling, - "[replicate] backward_time = 
%.2lfms\n", + "[replicate] backward_time = {:.2lf}ms\n", input_grad, output_grad, attrs.replicate_degree); diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index c53fe5d78b..efee73645b 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -69,7 +69,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Reshape] forward time = %.2lfms\n", + "[Reshape] forward time = {:.2lf}ms\n", per_device_state, input, output); @@ -86,7 +86,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Reshape] backward time = %.2lfms\n", + "[Reshape] backward time = {:.2lf}ms\n", per_device_state, input_grad, output_grad); diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 49f1e51076..7fefb3d357 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -63,7 +63,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[reverse] forward_time = %.2lfms\n", + "[reverse] forward_time = {:.2lf}ms\n", input.get_float_ptr(), output.get_float_ptr(), num_out_blks, @@ -93,7 +93,7 @@ static std::optional return profile(backward_kernel, profiling, - "[reverse] backward_time = %.2lfms\n", + "[reverse] backward_time = {:.2lf}ms\n", output_grad.get_float_ptr(), input_grad.get_float_ptr(), num_out_blks, diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 5a65127140..ea857c680b 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -72,7 +72,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[SoftMax] forward_time = %.2lfms\n", + "[SoftMax] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), 
output.get_float_ptr()); @@ -93,7 +93,7 @@ static std::optional return profile(backward_kernel, profiling, - "[SoftMax] backward_time = %.2lfms\n", + "[SoftMax] backward_time = {:.2lf}ms\n", input_grad.get_float_ptr(), output_grad.get_float_ptr(), output_grad.shape.get_volume()); diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc index ffb40515ad..13e95d37f9 100644 --- a/lib/local-execution/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -76,7 +76,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, profiling, - "Split forward_time = %.2lfms\n", + "Split forward_time = {:.2lf}ms\n", &output_float_ptr, input.get_float_ptr(), out_block_size, @@ -106,7 +106,7 @@ static std::optional float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, profiling, - "Split backward_time = %.2lfms\n", + "Split backward_time = {:.2lf}ms\n", input_grad.get_float_ptr(), &output_grad_ptr, out_block_size, diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc index f6783a2d6c..8aceb9c6d4 100644 --- a/lib/local-execution/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -81,7 +81,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[TopK] forward_time = %.2lfms\n", + "[TopK] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), output.get_float_ptr(), @@ -109,7 +109,7 @@ static std::optional return profile(backward_kernel, profiling, - "[TopK] backward_time = %.2lfms\n", + "[TopK] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), indices.get_int32_ptr(), diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index f580a46792..c998484455 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ 
b/lib/local-execution/src/ops/transpose.cc @@ -84,7 +84,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Transpose] Forward_time = %.2lf [ms]", + "[Transpose] Forward_time = {:.2lf} [ms]", per_device_state, input, output); @@ -101,7 +101,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Transpose] Backward_time = %.2lf [ms]", + "[Transpose] Backward_time = {:.2lf} [ms]", per_device_state, input_grad, output_grad); diff --git a/lib/op-attrs/include/op-attrs/ops/gather.h b/lib/op-attrs/include/op-attrs/ops/gather.h index 85a732e3d6..ca2406ef75 100644 --- a/lib/op-attrs/include/op-attrs/ops/gather.h +++ b/lib/op-attrs/include/op-attrs/ops/gather.h @@ -9,9 +9,9 @@ namespace FlexFlow { struct GatherAttrs { - req ff_dim; + ff_dim_t dim; }; -FF_VISITABLE_STRUCT(GatherAttrs, ff_dim); +FF_VISITABLE_STRUCT(GatherAttrs, dim); CHECK_VALID_OP_ATTR(GatherAttrs); } // namespace FlexFlow diff --git a/lib/substitutions/src/operator_attributes.cc b/lib/substitutions/src/operator_attributes.cc index 2c4cdfbcd8..8bd8688194 100644 --- a/lib/substitutions/src/operator_attributes.cc +++ b/lib/substitutions/src/operator_attributes.cc @@ -129,7 +129,7 @@ std::optional get_attribute(GatherAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::AXIS: - return p.ff_dim; + return p.dim; default: return std::nullopt; } From 26ddf7fe99b3b51fe9e32b015c5691380d3efa48 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 1 Jun 2024 16:19:49 -0700 Subject: [PATCH 22/24] Fix include --- .../include/{ => local-execution}/arg_ref.h | 4 ++-- .../include/{ => local-execution}/concrete_arg.h | 2 +- .../include/{ => local-execution}/config.h | 0 .../include/{ => local-execution}/cost_metrics.h | 0 .../{ => local-execution}/device_specific.h | 2 +- .../{ => local-execution}/legion_tensor_shape.h | 0 .../{ => local-execution}/local_allocator.h | 0 .../include/{ => 
local-execution}/op_arg_ref.h | 4 ++-- .../{ => local-execution}/op_task_invocation.h | 16 ++++++++-------- .../{ => local-execution}/op_task_signature.h | 8 ++++---- .../{ => local-execution}/op_tensor_spec.h | 2 +- .../include/{ => local-execution}/permissions.h | 0 .../include/{ => local-execution}/profiling.h | 0 .../{ => local-execution}/runtime_arg_ref.h | 6 +++--- .../{ => local-execution}/serialization.h | 0 .../{ => local-execution}/sim_environment.h | 6 +++--- .../include/{ => local-execution}/slot_id.h | 0 .../include/{ => local-execution}/slot_type.h | 0 .../task_argument_accessor.h | 14 +++++++------- .../include/{ => local-execution}/tasks.h | 0 .../{ => local-execution}/tracked_allocator.h | 2 +- .../{ => local-execution}/variadic_tensor_ref.h | 4 ++-- lib/local-execution/src/local_allocator.cc | 2 +- lib/local-execution/src/op_arg_ref.cc | 2 +- lib/local-execution/src/op_task_invocation.cc | 2 +- lib/local-execution/src/op_task_signature.cc | 2 +- lib/local-execution/src/ops/attention.cc | 2 +- lib/local-execution/src/ops/attention.h | 4 ++-- lib/local-execution/src/ops/batch_matmul.cc | 2 +- lib/local-execution/src/ops/batch_matmul.h | 6 +++--- lib/local-execution/src/ops/batch_norm.h | 4 ++-- lib/local-execution/src/ops/cast.cc | 2 +- lib/local-execution/src/ops/cast.h | 4 ++-- lib/local-execution/src/ops/combine.cc | 4 ++-- lib/local-execution/src/ops/combine.h | 4 ++-- lib/local-execution/src/ops/concat.cc | 4 ++-- lib/local-execution/src/ops/concat.h | 4 ++-- lib/local-execution/src/ops/conv_2d.h | 4 ++-- lib/local-execution/src/ops/dropout.cc | 4 ++-- lib/local-execution/src/ops/dropout.h | 6 +++--- lib/local-execution/src/ops/element_binary.h | 2 +- lib/local-execution/src/ops/element_unary.h | 4 ++-- lib/local-execution/src/ops/embedding.cc | 2 +- lib/local-execution/src/ops/embedding.h | 4 ++-- lib/local-execution/src/ops/flat.cc | 2 +- lib/local-execution/src/ops/flat.h | 2 +- lib/local-execution/src/ops/gather.cc | 4 ++-- 
lib/local-execution/src/ops/gather.h | 4 ++-- lib/local-execution/src/ops/layer_norm.h | 4 ++-- lib/local-execution/src/ops/linear.cc | 2 +- lib/local-execution/src/ops/linear.h | 4 ++-- lib/local-execution/src/ops/noop.cc | 2 +- lib/local-execution/src/ops/noop.h | 2 +- lib/local-execution/src/ops/pool_2d.h | 4 ++-- lib/local-execution/src/ops/reduce.h | 4 ++-- lib/local-execution/src/ops/reduction.h | 4 ++-- lib/local-execution/src/ops/repartition.h | 4 ++-- lib/local-execution/src/ops/replicate.h | 4 ++-- lib/local-execution/src/ops/reshape.h | 4 ++-- lib/local-execution/src/ops/reverse.h | 4 ++-- lib/local-execution/src/ops/softmax.h | 4 ++-- lib/local-execution/src/ops/split.h | 4 ++-- lib/local-execution/src/ops/topk.h | 4 ++-- lib/local-execution/src/ops/transpose.h | 4 ++-- lib/local-execution/src/permissions.cc | 2 +- lib/local-execution/src/runtime_arg_ref.cc | 4 ++-- lib/local-execution/src/tracked_allocator.cc | 2 +- lib/local-execution/src/variadic_tensor_ref.cc | 2 +- 68 files changed, 112 insertions(+), 112 deletions(-) rename lib/local-execution/include/{ => local-execution}/arg_ref.h (94%) rename lib/local-execution/include/{ => local-execution}/concrete_arg.h (96%) rename lib/local-execution/include/{ => local-execution}/config.h (100%) rename lib/local-execution/include/{ => local-execution}/cost_metrics.h (100%) rename lib/local-execution/include/{ => local-execution}/device_specific.h (96%) rename lib/local-execution/include/{ => local-execution}/legion_tensor_shape.h (100%) rename lib/local-execution/include/{ => local-execution}/local_allocator.h (100%) rename lib/local-execution/include/{ => local-execution}/op_arg_ref.h (86%) rename lib/local-execution/include/{ => local-execution}/op_task_invocation.h (88%) rename lib/local-execution/include/{ => local-execution}/op_task_signature.h (95%) rename lib/local-execution/include/{ => local-execution}/op_tensor_spec.h (89%) rename lib/local-execution/include/{ => local-execution}/permissions.h 
(100%) rename lib/local-execution/include/{ => local-execution}/profiling.h (100%) rename lib/local-execution/include/{ => local-execution}/runtime_arg_ref.h (83%) rename lib/local-execution/include/{ => local-execution}/serialization.h (100%) rename lib/local-execution/include/{ => local-execution}/sim_environment.h (96%) rename lib/local-execution/include/{ => local-execution}/slot_id.h (100%) rename lib/local-execution/include/{ => local-execution}/slot_type.h (100%) rename lib/local-execution/include/{ => local-execution}/task_argument_accessor.h (94%) rename lib/local-execution/include/{ => local-execution}/tasks.h (100%) rename lib/local-execution/include/{ => local-execution}/tracked_allocator.h (94%) rename lib/local-execution/include/{ => local-execution}/variadic_tensor_ref.h (81%) diff --git a/lib/local-execution/include/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h similarity index 94% rename from lib/local-execution/include/arg_ref.h rename to lib/local-execution/include/local-execution/arg_ref.h index b0e2b57b05..50fe4e6f80 100644 --- a/lib/local-execution/include/arg_ref.h +++ b/lib/local-execution/include/local-execution/arg_ref.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H #include "kernels/ff_handle.h" -#include "profiling.h" -#include "serialization.h" +#include "local-execution/profiling.h" +#include "local-execution/serialization.h" #include "utils/type_index.h" #include "utils/visitable.h" diff --git a/lib/local-execution/include/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h similarity index 96% rename from lib/local-execution/include/concrete_arg.h rename to lib/local-execution/include/local-execution/concrete_arg.h index 072500f47e..2db5e45e9e 100644 --- a/lib/local-execution/include/concrete_arg.h +++ b/lib/local-execution/include/local-execution/concrete_arg.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H #define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H 
-#include "serialization.h" +#include "local-execution/serialization.h" #include "utils/type_index.h" #include diff --git a/lib/local-execution/include/config.h b/lib/local-execution/include/local-execution/config.h similarity index 100% rename from lib/local-execution/include/config.h rename to lib/local-execution/include/local-execution/config.h diff --git a/lib/local-execution/include/cost_metrics.h b/lib/local-execution/include/local-execution/cost_metrics.h similarity index 100% rename from lib/local-execution/include/cost_metrics.h rename to lib/local-execution/include/local-execution/cost_metrics.h diff --git a/lib/local-execution/include/device_specific.h b/lib/local-execution/include/local-execution/device_specific.h similarity index 96% rename from lib/local-execution/include/device_specific.h rename to lib/local-execution/include/local-execution/device_specific.h index a055f6d274..6136d16f2d 100644 --- a/lib/local-execution/include/device_specific.h +++ b/lib/local-execution/include/local-execution/device_specific.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H #define _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H -#include "serialization.h" +#include "local-execution/serialization.h" #include "utils/exception.h" namespace FlexFlow { diff --git a/lib/local-execution/include/legion_tensor_shape.h b/lib/local-execution/include/local-execution/legion_tensor_shape.h similarity index 100% rename from lib/local-execution/include/legion_tensor_shape.h rename to lib/local-execution/include/local-execution/legion_tensor_shape.h diff --git a/lib/local-execution/include/local_allocator.h b/lib/local-execution/include/local-execution/local_allocator.h similarity index 100% rename from lib/local-execution/include/local_allocator.h rename to lib/local-execution/include/local-execution/local_allocator.h diff --git a/lib/local-execution/include/op_arg_ref.h b/lib/local-execution/include/local-execution/op_arg_ref.h similarity index 86% rename from 
lib/local-execution/include/op_arg_ref.h rename to lib/local-execution/include/local-execution/op_arg_ref.h index 577ac7984a..1650656b42 100644 --- a/lib/local-execution/include/op_arg_ref.h +++ b/lib/local-execution/include/local-execution/op_arg_ref.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H -#include "arg_ref.h" -#include "device_specific.h" +#include "local-execution/arg_ref.h" +#include "local-execution/device_specific.h" #include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { diff --git a/lib/local-execution/include/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h similarity index 88% rename from lib/local-execution/include/op_task_invocation.h rename to lib/local-execution/include/local-execution/op_task_invocation.h index 1bf94a1b0d..9783d1fe88 100644 --- a/lib/local-execution/include/op_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -1,17 +1,17 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H -#include "concrete_arg.h" +#include "local-execution/concrete_arg.h" #include "kernels/accessor.h" -#include "op_arg_ref.h" -#include "op_task_signature.h" -#include "op_tensor_spec.h" -#include "profiling.h" -#include "runtime_arg_ref.h" -#include "tasks.h" +#include "local-execution/op_arg_ref.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/op_tensor_spec.h" +#include "local-execution/profiling.h" +#include "local-execution/runtime_arg_ref.h" +#include "local-execution/tasks.h" #include "utils/bidict.h" #include "utils/stack_map.h" -#include "variadic_tensor_ref.h" +#include "local-execution/variadic_tensor_ref.h" #include #include #include diff --git a/lib/local-execution/include/op_task_signature.h b/lib/local-execution/include/local-execution/op_task_signature.h similarity index 95% rename from 
lib/local-execution/include/op_task_signature.h rename to lib/local-execution/include/local-execution/op_task_signature.h index 840c321627..3bcb8397b7 100644 --- a/lib/local-execution/include/op_task_signature.h +++ b/lib/local-execution/include/local-execution/op_task_signature.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H -#include "serialization.h" -#include "slot_id.h" -#include "slot_type.h" -#include "tasks.h" +#include "local-execution/serialization.h" +#include "local-execution/slot_id.h" +#include "local-execution/slot_type.h" +#include "local-execution/tasks.h" #include "utils/type_index.h" #include "utils/visitable.h" diff --git a/lib/local-execution/include/op_tensor_spec.h b/lib/local-execution/include/local-execution/op_tensor_spec.h similarity index 89% rename from lib/local-execution/include/op_tensor_spec.h rename to lib/local-execution/include/local-execution/op_tensor_spec.h index c12b5342e1..cc2cd75153 100644 --- a/lib/local-execution/include/op_tensor_spec.h +++ b/lib/local-execution/include/local-execution/op_tensor_spec.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H -#include "op_task_signature.h" +#include "local-execution/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/permissions.h b/lib/local-execution/include/local-execution/permissions.h similarity index 100% rename from lib/local-execution/include/permissions.h rename to lib/local-execution/include/local-execution/permissions.h diff --git a/lib/local-execution/include/profiling.h b/lib/local-execution/include/local-execution/profiling.h similarity index 100% rename from lib/local-execution/include/profiling.h rename to lib/local-execution/include/local-execution/profiling.h diff --git a/lib/local-execution/include/runtime_arg_ref.h 
b/lib/local-execution/include/local-execution/runtime_arg_ref.h similarity index 83% rename from lib/local-execution/include/runtime_arg_ref.h rename to lib/local-execution/include/local-execution/runtime_arg_ref.h index 05afa456cf..295f32455c 100644 --- a/lib/local-execution/include/runtime_arg_ref.h +++ b/lib/local-execution/include/local-execution/runtime_arg_ref.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H -#include "arg_ref.h" -#include "config.h" -#include "device_specific.h" +#include "local-execution/arg_ref.h" +#include "local-execution/config.h" +#include "local-execution/device_specific.h" namespace FlexFlow { diff --git a/lib/local-execution/include/serialization.h b/lib/local-execution/include/local-execution/serialization.h similarity index 100% rename from lib/local-execution/include/serialization.h rename to lib/local-execution/include/local-execution/serialization.h diff --git a/lib/local-execution/include/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h similarity index 96% rename from lib/local-execution/include/sim_environment.h rename to lib/local-execution/include/local-execution/sim_environment.h index 4409ab8b55..efcc41d58b 100644 --- a/lib/local-execution/include/sim_environment.h +++ b/lib/local-execution/include/local-execution/sim_environment.h @@ -1,13 +1,13 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H #define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H -#include "cost_metrics.h" +#include "local-execution/cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" #include "op-attrs/parallel_tensor_shape.h" -#include "op_task_invocation.h" +#include "local-execution/op_task_invocation.h" #include "pcg/machine_view.h" -#include "task_argument_accessor.h" +#include "local-execution/task_argument_accessor.h" #include namespace FlexFlow { diff --git a/lib/local-execution/include/slot_id.h 
b/lib/local-execution/include/local-execution/slot_id.h similarity index 100% rename from lib/local-execution/include/slot_id.h rename to lib/local-execution/include/local-execution/slot_id.h diff --git a/lib/local-execution/include/slot_type.h b/lib/local-execution/include/local-execution/slot_type.h similarity index 100% rename from lib/local-execution/include/slot_type.h rename to lib/local-execution/include/local-execution/slot_type.h diff --git a/lib/local-execution/include/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h similarity index 94% rename from lib/local-execution/include/task_argument_accessor.h rename to lib/local-execution/include/local-execution/task_argument_accessor.h index 0656af0fe3..df0637142a 100644 --- a/lib/local-execution/include/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,17 +1,17 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H -#include "arg_ref.h" -#include "concrete_arg.h" -#include "config.h" -#include "device_specific.h" +#include "local-execution/arg_ref.h" +#include "local-execution/concrete_arg.h" +#include "local-execution/config.h" +#include "local-execution/device_specific.h" #include "kernels/accessor.h" #include "kernels/allocation.h" #include "kernels/linear_kernels.h" #include "op-attrs/parallel_tensor_shape.h" -#include "op_task_signature.h" -#include "permissions.h" -#include "tasks.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/permissions.h" +#include "local-execution/tasks.h" #include "utils/variant.h" #include #include diff --git a/lib/local-execution/include/tasks.h b/lib/local-execution/include/local-execution/tasks.h similarity index 100% rename from lib/local-execution/include/tasks.h rename to lib/local-execution/include/local-execution/tasks.h diff --git 
a/lib/local-execution/include/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h similarity index 94% rename from lib/local-execution/include/tracked_allocator.h rename to lib/local-execution/include/local-execution/tracked_allocator.h index 4f51670426..ea3eec64e0 100644 --- a/lib/local-execution/include/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H #include "kernels/allocation.h" -#include "local_allocator.h" +#include "local-execution/local_allocator.h" namespace FlexFlow { diff --git a/lib/local-execution/include/variadic_tensor_ref.h b/lib/local-execution/include/local-execution/variadic_tensor_ref.h similarity index 81% rename from lib/local-execution/include/variadic_tensor_ref.h rename to lib/local-execution/include/local-execution/variadic_tensor_ref.h index 091c55b0af..56da1bab64 100644 --- a/lib/local-execution/include/variadic_tensor_ref.h +++ b/lib/local-execution/include/local-execution/variadic_tensor_ref.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H -#include "arg_ref.h" -#include "op_tensor_spec.h" +#include "local-execution/arg_ref.h" +#include "local-execution/op_tensor_spec.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local_allocator.cc b/lib/local-execution/src/local_allocator.cc index 0bb7d04574..d393643ead 100644 --- a/lib/local-execution/src/local_allocator.cc +++ b/lib/local-execution/src/local_allocator.cc @@ -1,4 +1,4 @@ -#include "local_allocator.h" +#include "local-execution/local_allocator.h" #include "kernels/device.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_arg_ref.cc b/lib/local-execution/src/op_arg_ref.cc index 6bea26a5a2..8e9b56272b 100644 --- a/lib/local-execution/src/op_arg_ref.cc +++ b/lib/local-execution/src/op_arg_ref.cc @@ -1,4 +1,4 @@ -#include 
"op_arg_ref.h" +#include "local-execution/op_arg_ref.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 5683cb12ec..adad2f3a72 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -1,4 +1,4 @@ -#include "op_task_invocation.h" +#include "local-execution/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index 71642680a6..53a685910e 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -1,4 +1,4 @@ -#include "op_task_signature.h" +#include "local-execution/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index 3f11829d2f..6e6d23cd4a 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -15,7 +15,7 @@ #include "attention.h" #include "kernels/attention_kernels.h" -#include "op_task_signature.h" +#include "local-execution/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/attention.h b/lib/local-execution/src/ops/attention.h index 601d8a4796..c8eb17ecec 100644 --- a/lib/local-execution/src/ops/attention.h +++ b/lib/local-execution/src/ops/attention.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_ATTENTION_H #define _FLEXFLOW_ATTENTION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/attention.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index 76bc88eae6..187e97ecaa 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -15,9 +15,9 @@ #include 
"batch_matmul.h" #include "kernels/batch_matmul_kernels.h" +#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" -#include "op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_matmul.h b/lib/local-execution/src/ops/batch_matmul.h index 6791b11a8c..94457c22be 100644 --- a/lib/local-execution/src/ops/batch_matmul.h +++ b/lib/local-execution/src/ops/batch_matmul.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_BATCH_MATMUL_H #define _FLEXFLOW_BATCH_MATMUL_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/batch_matmul.h" -#include "op_task_invocation.h" -#include "op_task_signature.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/batch_norm.h b/lib/local-execution/src/ops/batch_norm.h index 6fae871c2c..1745a5cac8 100644 --- a/lib/local-execution/src/ops/batch_norm.h +++ b/lib/local-execution/src/ops/batch_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/batch_norm.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 7a74c8824d..9e1f777d73 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -16,7 +16,7 @@ #include "cast.h" #include "kernels/cast_kernels.h" -#include "op_task_signature.h" +#include "local-execution/op_task_signature.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Cast; diff --git a/lib/local-execution/src/ops/cast.h b/lib/local-execution/src/ops/cast.h index ce9a93aa32..69aeadf497 100644 --- a/lib/local-execution/src/ops/cast.h +++ 
b/lib/local-execution/src/ops/cast.h @@ -15,9 +15,9 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/cast.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index a39a503333..6df09b53f4 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -15,7 +15,7 @@ #include "combine.h" #include "kernels/combine_kernels.h" -#include "op_task_invocation.h" +#include "local-execution/op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -64,7 +64,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Combine] forward_time = {:.2lf}ms\n", + "[Combine] backward_time = {:.2lf}ms\n", input_grad, output_grad); } diff --git a/lib/local-execution/src/ops/combine.h b/lib/local-execution/src/ops/combine.h index 5923e9ebcc..f9349a01ef 100644 --- a/lib/local-execution/src/ops/combine.h +++ b/lib/local-execution/src/ops/combine.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/combine.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 3cbc232fac..f3c2eba48f 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -16,10 +16,10 @@ #include "concat.h" #include "kernels/concat_kernels.h" +#include "local-execution/op_task_signature.h" +#include "local-execution/variadic_tensor_ref.h" #include "op-attrs/get_output_shapes.h" -#include "op_task_signature.h" #include "utils/hash-utils.h" -#include "variadic_tensor_ref.h" namespace FlexFlow { diff --git 
a/lib/local-execution/src/ops/concat.h b/lib/local-execution/src/ops/concat.h index d0a432e8b3..fa61d87e77 100644 --- a/lib/local-execution/src/ops/concat.h +++ b/lib/local-execution/src/ops/concat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/concat.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/conv_2d.h b/lib/local-execution/src/ops/conv_2d.h index 0e92b00553..0c8181adce 100644 --- a/lib/local-execution/src/ops/conv_2d.h +++ b/lib/local-execution/src/ops/conv_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/conv_2d.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index 3db1e7b8eb..9d680054ea 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -1,8 +1,8 @@ #include "dropout.h" #include "kernels/dropout_kernels.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" -#include "op_task_invocation.h" -#include "op_task_signature.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/dropout.h b/lib/local-execution/src/ops/dropout.h index 4f22842c8a..53fbeb3857 100644 --- a/lib/local-execution/src/ops/dropout.h +++ b/lib/local-execution/src/ops/dropout.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" +#include "local-execution/tasks.h" #include "op-attrs/ops/dropout.h" -#include 
"op_task_invocation.h" -#include "sim_environment.h" -#include "tasks.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/element_binary.h b/lib/local-execution/src/ops/element_binary.h index 342909c468..fa4202dffd 100644 --- a/lib/local-execution/src/ops/element_binary.h +++ b/lib/local-execution/src/ops/element_binary.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_ELEMENT_BINARY_H #define _FLEXFLOW_ELEMENT_BINARY_H +#include "local-execution/sim_environment.h" #include "op-attrs/ops/element_binary.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/element_unary.h b/lib/local-execution/src/ops/element_unary.h index 83f6177b8d..e0f58e8a75 100644 --- a/lib/local-execution/src/ops/element_unary.h +++ b/lib/local-execution/src/ops/element_unary.h @@ -1,9 +1,9 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/element_unary.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc index 6ce13d88c9..27d667cd00 100644 --- a/lib/local-execution/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -53,7 +53,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Embedding] forward_time = {:.2lf}ms\n", + "[Embedding] backward_time = {:.2lf}ms\n", input, output, weight, diff --git a/lib/local-execution/src/ops/embedding.h b/lib/local-execution/src/ops/embedding.h index b4caebf952..c33b1161bf 100644 --- a/lib/local-execution/src/ops/embedding.h +++ b/lib/local-execution/src/ops/embedding.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/embedding.h" -#include 
"op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 194d84aaa8..3c2499da79 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -48,7 +48,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Flat] forward_time = {:.2lf}ms\n", + "[Flat] backward_time = {:.2lf}ms\n", input, input_grad.get_float_ptr(), output_grad.get_float_ptr()); diff --git a/lib/local-execution/src/ops/flat.h b/lib/local-execution/src/ops/flat.h index 13246028fb..d9ea4d3985 100644 --- a/lib/local-execution/src/ops/flat.h +++ b/lib/local-execution/src/ops/flat.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H +#include "local-execution/sim_environment.h" #include "op-attrs/ops/flat.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index 091abd0ed3..deb436842a 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -15,7 +15,7 @@ #include "gather.h" #include "kernels/gather_kernels.h" -#include "legion_tensor_shape.h" +#include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" #include @@ -109,7 +109,7 @@ static std::optional auto index = acc.get_tensor(INDEX); auto input_grad = acc.get_tensor_grad(INPUT); - return profile(forward_kernel, + return profile(backward_kernel, profiling, "[Gather] forward_time = {:.2lf}ms\n", per_device_state, diff --git a/lib/local-execution/src/ops/gather.h b/lib/local-execution/src/ops/gather.h index e83f768cb7..e2de09d96a 100644 --- a/lib/local-execution/src/ops/gather.h +++ b/lib/local-execution/src/ops/gather.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_GATHER_H #define _FLEXFLOW_GATHER_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/gather.h" 
-#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/layer_norm.h b/lib/local-execution/src/ops/layer_norm.h index 83e6733bf6..4eadb9ff09 100644 --- a/lib/local-execution/src/ops/layer_norm.h +++ b/lib/local-execution/src/ops/layer_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/layer_norm.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 36533be211..e2c9d9aef4 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -1,8 +1,8 @@ #include "linear.h" #include "kernels/linear_kernels.h" +#include "local-execution/task_argument_accessor.h" #include "op-attrs/ff_dim.h" #include "op-attrs/get_output_shapes.h" -#include "task_argument_accessor.h" #include "utils/exception.h" #include "utils/graph/views.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/ops/linear.h b/lib/local-execution/src/ops/linear.h index 2b476382ef..2ff9016114 100644 --- a/lib/local-execution/src/ops/linear.h +++ b/lib/local-execution/src/ops/linear.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/linear.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/noop.cc b/lib/local-execution/src/ops/noop.cc index 02ffeaf111..168d547c17 100644 --- a/lib/local-execution/src/ops/noop.cc +++ b/lib/local-execution/src/ops/noop.cc @@ -14,7 +14,7 @@ */ #include "noop.h" -#include "op_task_invocation.h" +#include "local-execution/op_task_invocation.h" 
#include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/noop.h b/lib/local-execution/src/ops/noop.h index 17a9426e77..fab2cf1f86 100644 --- a/lib/local-execution/src/ops/noop.h +++ b/lib/local-execution/src/ops/noop.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_NOOP_H #define _FLEXFLOW_NOOP_H +#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/input.h" #include "op-attrs/ops/noop.h" -#include "op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/pool_2d.h b/lib/local-execution/src/ops/pool_2d.h index 852110e2e2..0537e9f1c4 100644 --- a/lib/local-execution/src/ops/pool_2d.h +++ b/lib/local-execution/src/ops/pool_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/pool_2d.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/reduce.h b/lib/local-execution/src/ops/reduce.h index 4c22a9127e..6d47ec2f4d 100644 --- a/lib/local-execution/src/ops/reduce.h +++ b/lib/local-execution/src/ops/reduce.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reduce.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/reduction.h b/lib/local-execution/src/ops/reduction.h index 071c4d2a7b..a69b75f310 100644 --- a/lib/local-execution/src/ops/reduction.h +++ b/lib/local-execution/src/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reduction.h" -#include "op_task_invocation.h" -#include 
"sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/repartition.h b/lib/local-execution/src/ops/repartition.h index 0c8cdaf0f9..a73bd3f808 100644 --- a/lib/local-execution/src/ops/repartition.h +++ b/lib/local-execution/src/ops/repartition.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/repartition.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/replicate.h b/lib/local-execution/src/ops/replicate.h index 510676931b..339f805f2c 100644 --- a/lib/local-execution/src/ops/replicate.h +++ b/lib/local-execution/src/ops/replicate.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/replicate.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/reshape.h b/lib/local-execution/src/ops/reshape.h index 0b845de5fc..14b22561a0 100644 --- a/lib/local-execution/src/ops/reshape.h +++ b/lib/local-execution/src/ops/reshape.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reshape.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/reverse.h b/lib/local-execution/src/ops/reverse.h index 68545644bd..5be501698c 100644 --- a/lib/local-execution/src/ops/reverse.h +++ b/lib/local-execution/src/ops/reverse.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/reverse.h" 
-#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/softmax.h b/lib/local-execution/src/ops/softmax.h index 8fe2f96eb5..a83d8f4116 100644 --- a/lib/local-execution/src/ops/softmax.h +++ b/lib/local-execution/src/ops/softmax.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SOFTMAX_H #define _FLEXFLOW_SOFTMAX_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/softmax.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/split.h b/lib/local-execution/src/ops/split.h index 1fdfdc2432..f51e0ea6af 100644 --- a/lib/local-execution/src/ops/split.h +++ b/lib/local-execution/src/ops/split.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/split.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/topk.h b/lib/local-execution/src/ops/topk.h index fcab2a5a31..db85fd9d03 100644 --- a/lib/local-execution/src/ops/topk.h +++ b/lib/local-execution/src/ops/topk.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/topk.h" -#include "op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/ops/transpose.h b/lib/local-execution/src/ops/transpose.h index 6c6dffdc8a..daa64e8e59 100644 --- a/lib/local-execution/src/ops/transpose.h +++ b/lib/local-execution/src/ops/transpose.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ +#include "local-execution/op_task_invocation.h" +#include "local-execution/sim_environment.h" #include "op-attrs/ops/transpose.h" -#include 
"op_task_invocation.h" -#include "sim_environment.h" namespace FlexFlow { diff --git a/lib/local-execution/src/permissions.cc b/lib/local-execution/src/permissions.cc index 2843dd1b70..e5c46b42f8 100644 --- a/lib/local-execution/src/permissions.cc +++ b/lib/local-execution/src/permissions.cc @@ -1,4 +1,4 @@ -#include "permissions.h" +#include "local-execution/permissions.h" #include "utils/exception.h" namespace FlexFlow { diff --git a/lib/local-execution/src/runtime_arg_ref.cc b/lib/local-execution/src/runtime_arg_ref.cc index a9d573bbb5..df4f024f1d 100644 --- a/lib/local-execution/src/runtime_arg_ref.cc +++ b/lib/local-execution/src/runtime_arg_ref.cc @@ -1,5 +1,5 @@ -#include "runtime_arg_ref.h" -#include "device_specific.h" +#include "local-execution/runtime_arg_ref.h" +#include "local-execution/device_specific.h" namespace FlexFlow { diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index 6d06714252..68636906c3 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -1,4 +1,4 @@ -#include "tracked_allocator.h" +#include "local-execution/tracked_allocator.h" #include "kernels/device.h" namespace FlexFlow { diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/local-execution/src/variadic_tensor_ref.cc index 74d0f0d9e7..efd43a6648 100644 --- a/lib/local-execution/src/variadic_tensor_ref.cc +++ b/lib/local-execution/src/variadic_tensor_ref.cc @@ -1,4 +1,4 @@ -#include "variadic_tensor_ref.h" +#include "local-execution/variadic_tensor_ref.h" namespace FlexFlow { From 1dfc24e1653b4e139839af968b8266b674bcdb9e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 1 Jun 2024 16:22:42 -0700 Subject: [PATCH 23/24] Gather backward time --- lib/local-execution/src/ops/gather.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index deb436842a..50b27d72a6 
100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -111,7 +111,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Gather] forward_time = {:.2lf}ms\n", + "[Gather] backward_time = {:.2lf}ms\n", per_device_state, output_grad, index, From c60efd91b29378c1bae9c88787a25abe4b651993 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 1 Jun 2024 16:25:17 -0700 Subject: [PATCH 24/24] Format --- .../include/local-execution/op_task_invocation.h | 4 ++-- .../include/local-execution/sim_environment.h | 6 +++--- .../include/local-execution/task_argument_accessor.h | 8 ++++---- lib/local-execution/src/ops/embedding.cc | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h index 9783d1fe88..37ca5c239d 100644 --- a/lib/local-execution/include/local-execution/op_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -1,17 +1,17 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H -#include "local-execution/concrete_arg.h" #include "kernels/accessor.h" +#include "local-execution/concrete_arg.h" #include "local-execution/op_arg_ref.h" #include "local-execution/op_task_signature.h" #include "local-execution/op_tensor_spec.h" #include "local-execution/profiling.h" #include "local-execution/runtime_arg_ref.h" #include "local-execution/tasks.h" +#include "local-execution/variadic_tensor_ref.h" #include "utils/bidict.h" #include "utils/stack_map.h" -#include "local-execution/variadic_tensor_ref.h" #include #include #include diff --git a/lib/local-execution/include/local-execution/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h index efcc41d58b..78608a3228 100644 --- a/lib/local-execution/include/local-execution/sim_environment.h +++ 
b/lib/local-execution/include/local-execution/sim_environment.h @@ -1,13 +1,13 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H #define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H -#include "local-execution/cost_metrics.h" #include "kernels/accessor.h" #include "kernels/allocation.h" -#include "op-attrs/parallel_tensor_shape.h" +#include "local-execution/cost_metrics.h" #include "local-execution/op_task_invocation.h" -#include "pcg/machine_view.h" #include "local-execution/task_argument_accessor.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/machine_view.h" #include namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index df0637142a..663c862e18 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,17 +1,17 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/linear_kernels.h" #include "local-execution/arg_ref.h" #include "local-execution/concrete_arg.h" #include "local-execution/config.h" #include "local-execution/device_specific.h" -#include "kernels/accessor.h" -#include "kernels/allocation.h" -#include "kernels/linear_kernels.h" -#include "op-attrs/parallel_tensor_shape.h" #include "local-execution/op_task_signature.h" #include "local-execution/permissions.h" #include "local-execution/tasks.h" +#include "op-attrs/parallel_tensor_shape.h" #include "utils/variant.h" #include #include diff --git a/lib/local-execution/src/ops/embedding.cc b/lib/local-execution/src/ops/embedding.cc index 27d667cd00..00d6d033d4 100644 --- a/lib/local-execution/src/ops/embedding.cc +++ b/lib/local-execution/src/ops/embedding.cc @@ -53,7 +53,7 @@ static std::optional 
forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, - "[Embedding] backward_time = {:.2lf}ms\n", + "[Embedding] forward_time = {:.2lf}ms\n", input, output, weight, @@ -76,7 +76,7 @@ static std::optional return profile(backward_kernel, profiling, - "[Embedding] forward_time = {:.2lf}ms\n", + "[Embedding] backward_time = {:.2lf}ms\n", input, output, weight_grad,