From 30959792bd0b82e49a5feb53ffd918e214149fed Mon Sep 17 00:00:00 2001 From: Kate Unger Date: Thu, 7 Sep 2023 13:50:08 -0700 Subject: [PATCH 1/5] conv_2d --- lib/kernels/include/kernels/conv_2d_kernels.h | 67 +- lib/runtime/src/ops/conv_2d.cc | 1137 +++-------------- lib/runtime/src/ops/conv_2d.h | 691 +++++++++- 3 files changed, 888 insertions(+), 1007 deletions(-) diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index 50b3c0601f..b327a2d2cd 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -5,45 +5,50 @@ namespace FlexFlow { -class Conv2DPerDeviceState : public PerDeviceOpState { -public: - Conv2DPerDeviceState(FFHandler handler); - ffTensorDescriptor_t inputTensor, biasTensor, outputTensor; +struct Conv2DPerDeviceState { + PerDeviceFFHandle handle; + ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t biasTensor; + ffTensorDescriptor_t outputTensor; ffFilterDescriptor_t filterDesc; ffActivationDescriptor_t actiDesc; ffConvolutionDescriptor_t convDesc; ffConvolutionFwdAlgo_t fwdAlgo; ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; ffConvolutionBwdDataAlgo_t bwdDataAlgo; - bool relu, use_bias; - char op_name[MAX_OPNAME]; + req> activation; + req use_bias; }; +FF_VISITABLE_STRUCT_NO_EQ(Conv2DPerDeviceState, + handle, + inputTensor, + biasTensor, + outputTensor, + filterDesc, + actiDesc, + convDesc, + fwdAlgo, + bwdFilterAlgo, + bwdDataAlgo, + activation, + use_bias); + namespace Kernels { namespace Conv2D { -void init_kernel(Conv2DPerDeviceState *m, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int kernel_h, - int kernel_w, - int groups, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - float const *input_ptr, - float *output_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *forward_time = nullptr, - float *backward_time = nullptr); 
+Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, + ffTensorDescriptor_t inputTensor, + ffTensorDescriptor_t biasTensor, + ffTensorDescriptor_t outputTensor, + ffFilterDescriptor_t filterDesc, + ffActivationDescriptor_t actiDesc, + ffConvolutionDescriptor_t convDesc, + ffConvolutionFwdAlgo_t fwdAlgo, + ffConvolutionBwdFilterAlgo_t bwdFilterAlgo, + ffConvolutionBwdDataAlgo_t bwdDataAlgo, + req> relu, + bool use_bias); void forward_kernel(ffStream_t stream, Conv2DPerDeviceState const *m, @@ -58,12 +63,12 @@ void backward_kernel(ffStream_t stream, float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, float *bias_grad_ptr); } // namespace Conv2D } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H +#endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H \ No newline at end of file diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/runtime/src/ops/conv_2d.cc index e362c73f92..0679fff96b 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/runtime/src/ops/conv_2d.cc @@ -1,1051 +1,238 @@ #include "conv_2d.h" #include "kernels/conv_2d_kernels.h" -#include "layer.h" #include "legion/legion_utilities.h" #include "mpark/variant.hpp" -#include "task_spec.h" +#include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -enum Slots { - INPUT, - OUTPUT, - FILTER, - BIAS, - FILTER_GRAD, - INPUT_GRAD, - OUTPUT_GRAD, - BIAS_GRAD, - ATTRS, - PROFILING, -} - -// declare Legion names -using Legion::ArgumentMap; using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; using Legion::Runtime; using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using 
namespace FlexFlow::Kernels::Conv2D; -Tensor FFModel::conv2d(Tensor const &input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, - ActiMode activation, - int groups, - bool use_bias, - Layer const *shared_op, - Initializer *kernel_initializer, - Initializer *bias_initializer, - char const *name) { - assert(input->num_dims() == 4); /*NCHW*/ - - Conv2DAttrs attrs = {outChannels, - kernelH, - kernelW, - strideH, - strideW, - paddingH, - paddingW, - groups, - activation, - use_bias}; - - TensorShape output_shape = get_output_shape(attrs, input->get_shape()); - Tensor output = this->tensor_mgr.create(output_shape, CreateGrad::YES, conv); - - std::vector weights; - - TensorShape kernel_shape = get_kernel_shape(attrs, input->get_shape()); - weights.push_back(this->tensor_mgr.create( - kernel_shape, CreateGrad::YES, kernel_initializer, CHOSEN_SYNC_TYPE)); - - if (use_bias) { - TensorShape bias_shape = get_bias_shape(attrs, input->get_shape()); - weights.push_back(this->tensor_mgr.create( - bias_shape, CreateGrad::YES, bias_initializer, CHOSEN_SYNC_TYPE)); - } - - Layer *conv = - this->layer_mgr.create(attrs, DT_FLOAT, name, {input}, weights, {output}); - - //{ - // int numdims = 4; - // int dims[MAX_TENSOR_DIM]; - // dims[3] = input->dims[3]; - // dims[2] = outChannels; - // dims[1] = 1 + (input->dims[1] + 2 * paddingH - kernelH) / strideH; - // dims[0] = 1 + (input->dims[0] + 2 * paddingW - kernelW) / strideW; - // conv->outputs[0] = create_tensor_legion_ordering( - // numdims, dims, DT_FLOAT, conv, 0, true /*create_grad*/); - //} - //{ - // int dims[4] = {kernelW, kernelH, input->dims[2], outChannels}; - // conv->weights[0] = create_weight_legion_ordering(4, - // dims, - // DT_FLOAT, - // conv, - // true /*create_grad*/, - // kernel_initializer, - // CHOSEN_SYNC_TYPE); - //} - // if (use_bias) { - // int dims[1] = {outChannels}; - // conv->weights[1] = create_weight_legion_ordering(1, - // dims, - // 
DT_FLOAT, - // conv, - // true /*create_grad*/, - // bias_initializer, - // CHOSEN_SYNC_TYPE); - //} - conv->add_initializer("kernel", kernel_initializer); - conv->add_initializer("bias", bias_initializer); - /* layers.push_back(conv); */ - return conv->outputs[0]; -} - -Op *Conv2D::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - return new Conv2D(model, - get(layer->attrs), - inputs, - layer->name, - false /*allocate_weights*/ - ); -} - -/* void Conv2DParams::mark_replica_dims( */ -/* ParallelTensorShape const &input, */ -/* ParallelDim output_dims[MAX_TENSOR_DIM], */ -/* ParallelDim kernel_dims[MAX_TENSOR_DIM], */ -/* ParallelDim bias_dims[MAX_TENSOR_DIM]) const { */ -/* if (output_dims != nullptr) { */ -/* output_dims[Conv2DOutput::REPLICA].is_replica_dim = true; */ -/* } */ -/* if (kernel_dims != nullptr) { */ -/* kernel_dims[Conv2DOutput::REPLICA].is_replica_dim = true; */ -/* } */ -/* if (bias_dims != nullptr) { */ -/* bias_dims[Conv2DBias::REPLICA_1].is_replica_dim = true; */ -/* bias_dims[Conv2DBias::REPLICA_2].is_replica_dim = true; */ -/* bias_dims[Conv2DBias::REPLICA_3].is_replica_dim = true; */ -/* bias_dims[Conv2DBias::REPLICA_4].is_replica_dim = true; */ -/* } */ -/* } */ - -/* int Conv2DParams::output_size(ParallelTensorShape const &input, */ -/* ParallelDim output_dims[MAX_TENSOR_DIM]) const - * { */ -/* int input_w = input.dims[Conv2DInput::WIDTH].size; */ -/* int input_h = input.dims[Conv2DInput::HEIGHT].size; */ - -/* output_dims[Conv2DOutput::SAMPLE].size = - * input.dims[Conv2DInput::SAMPLE].size; */ -/* output_dims[Conv2DOutput::CHANNEL].size = out_channels; */ -/* output_dims[Conv2DOutput::HEIGHT].size = */ -/* 1 + (input_h + 2 * padding_h - kernel_h) / stride_h; */ -/* output_dims[Conv2DOutput::WIDTH].size = */ -/* 1 + (input_w + 2 * padding_w - kernel_w) / stride_w; */ - -/* return input.num_dims; */ -/* }; */ - -/* int Conv2DParams::kernel_size(ParallelTensorShape const &input, */ -/* 
ParallelDim kernel_dims[MAX_TENSOR_DIM]) const - * { */ -/* kernel_dims[Conv2DKernel::CHANNEL_OUT].size = this->out_channels; */ -/* kernel_dims[Conv2DKernel::CHANNEL_IN].size = */ -/* input.dims[Conv2DInput::CHANNEL].size / this->groups; */ -/* kernel_dims[Conv2DKernel::HEIGHT].size = */ -/* this->kernel_h * input.dims[Conv2DInput::HEIGHT].degree; */ -/* kernel_dims[Conv2DKernel::WIDTH].size = */ -/* this->kernel_w * input.dims[Conv2DInput::WIDTH].degree; */ - -/* return Conv2DKernel::NUMDIM; */ -/* } */ - -/* int Conv2DParams::bias_size(ParallelTensorShape const &input, */ -/* ParallelDim bias_dims[MAX_TENSOR_DIM]) const { */ -/* bias_dims[Conv2DBias::CHANNEL].size = this->out_channels; */ - -/* return Conv2DBias::NUMDIM; */ -/* }; */ - -/* void Conv2DParams::solve_dims(ParallelTensorShape const &input, */ -/* ParallelDim output_dims[MAX_TENSOR_DIM], */ -/* int *output_ndims, */ -/* ParallelDim kernel_dims[MAX_TENSOR_DIM], */ -/* int *kernel_ndims, */ -/* ParallelDim bias_dims[MAX_TENSOR_DIM], */ -/* int *bias_ndims) const { */ -/* assert((output_dims == nullptr) == (output_ndims == nullptr)); */ -/* assert((kernel_dims == nullptr) == (kernel_ndims == nullptr)); */ -/* assert((bias_dims == nullptr) == (bias_ndims == nullptr)); */ - -/* std::vector mapping; */ -/* Conv2D::construct_mappings(mapping, this->use_bias); */ - -/* this->mark_replica_dims(input, output_dims, kernel_dims, bias_dims); */ - -/* std::vector output_dim_sets; */ -/* if (output_dims != nullptr) { */ -/* output_dim_sets.push_back(output_dims); */ -/* } */ - -/* std::vector weight_dim_sets; */ -/* if (kernel_dims != nullptr) { */ -/* weight_dim_sets.push_back(kernel_dims); */ -/* } */ -/* if (bias_dims != nullptr && this->use_bias) { */ -/* weight_dim_sets.push_back(bias_dims); */ -/* } */ - -/* solve_parallel_dim_mappings( */ -/* mapping, {input.dims}, weight_dim_sets, output_dim_sets); */ - -/* if (output_dims != nullptr) { */ -/* *output_ndims = this->output_size(input, output_dims); */ -/* } 
*/ -/* if (kernel_dims != nullptr) { */ -/* *kernel_ndims = this->kernel_size(input, kernel_dims); */ -/* } */ -/* if (bias_dims != nullptr && this->use_bias) { */ -/* *bias_ndims = this->bias_size(input, bias_dims); */ -/* } */ -/* } */ - -/*static*/ -/* void Conv2D::construct_mappings(std::vector &out, - */ -/* bool use_bias) { */ -/* Conv2D::construct_output_mappings(out); */ -/* Conv2D::construct_weight_mappings(out, use_bias); */ -/* } */ - -/*static*/ -/* void Conv2D::construct_output_mappings( */ -/* std::vector &out) { */ -/* Op::construct_output_parallel_dims( */ -/* out, */ -/* {{Conv2DInput::CHANNEL, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DOutput::REPLICA}, */ -/* {Conv2DInput::SAMPLE, MappingOperation::PARTITION, - * Conv2DOutput::SAMPLE}, */ -/* {Conv2DInput::REPLICA, */ -/* MappingOperation::PARTITION, */ -/* Conv2DOutput::CHANNEL}, */ -/* {Conv2DInput::HEIGHT, MappingOperation::PARTITION, - * Conv2DOutput::HEIGHT}, */ -/* {Conv2DInput::WIDTH, MappingOperation::PARTITION, - * Conv2DOutput::WIDTH}}); */ -/* } */ - -/*static*/ -/* void Conv2D::construct_weight_mappings( */ -/* std::vector &out, bool use_bias) { */ -/* Op::construct_weight_parallel_dims( */ -/* out, */ -/* { */ -/* {Conv2DInput::REPLICA, */ -/* MappingOperation::PARTITION, */ -/* Conv2DKernel::CHANNEL_OUT}, */ -/* {Conv2DInput::SAMPLE, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DKernel::REPLICA}, */ -/* {Conv2DInput::CHANNEL, */ -/* MappingOperation::PARTITION, */ -/* Conv2DKernel::CHANNEL_IN}, */ -/* {Conv2DInput::HEIGHT, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DKernel::HEIGHT}, // Kernel::{HEIGHT, WEIGHT} would both work - */ -/* // here */ -/* {Conv2DInput::WIDTH, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DKernel::WIDTH}, // same as above */ -/* }, */ -/* Conv2DInput::INDEX, */ -/* Conv2DKernel::INDEX); */ - -/* if (use_bias) { */ -/* Op::construct_weight_parallel_dims( */ -/* out, */ -/* {{Conv2DInput::REPLICA, Conv2DBias::REPLICA_1}, */ -/* 
{Conv2DInput::SAMPLE, Conv2DBias::REPLICA_2}, */ -/* {Conv2DInput::CHANNEL, Conv2DBias::CHANNEL}, */ -/* {Conv2DInput::HEIGHT, Conv2DBias::REPLICA_3}, */ -/* {Conv2DInput::WIDTH, Conv2DBias::REPLICA_4}}, */ -/* Conv2DInput::INDEX, */ -/* Conv2DBias::INDEX); */ -/* } */ -/* } */ - -Conv2D::Conv2D(FFModel &model, - Conv2D const &other, - const ParallelTensor input, - bool allocate_weights) - : Conv2D(model, - other.layer_guid, - input, - other.out_channels, - other.kernel_h, - other.kernel_w, - other.stride_h, - other.stride_w, - other.padding_h, - other.padding_w, - other.activation, - other.groups, - other.use_bias, - allocate_weights, - other.name) {} - -Conv2D::Conv2D(FFModel &model, - Conv2DAttrs const &attrs, - std::vector const &inputs, - char const *name, - bool allocate_weights) - : Conv2D(model, - params.layer_guid, - input, - params.out_channels, - params.kernel_h, - params.kernel_w, - params.stride_h, - params.stride_w, - params.padding_h, - params.padding_w, - params.activation, - params.groups, - params.use_bias, - allocate_weights, - name) {} - -/* bool Conv2DParams::is_valid(ParallelTensorShape const &input) const { */ -/* ParallelTensorShape output_shape, kernel_shape, bias_shape; */ -/* this->solve_dims(input, */ -/* output_shape.dims, */ -/* &output_shape.num_dims, */ -/* kernel_shape.dims, */ -/* &kernel_shape.num_dims, */ -/* bias_shape.dims, */ -/* &bias_shape.num_dims); */ -/* bool is_valid = true; */ -/* is_valid &= input.is_valid(); */ -/* is_valid &= output_shape.is_valid(); */ -/* is_valid &= kernel_shape.is_valid(); */ -/* if (use_bias) { */ -/* is_valid &= bias_shape.is_valid(); */ -/* } */ - -/* // TODO FIXME: Currently disable parallelizing the height and width - * dimension */ -/* if (input.dims[0].degree > 1 || input.dims[1].degree > 1) { */ -/* return false; */ -/* } */ - -/* return is_valid; */ -/* } */ - -Conv2D::Conv2D(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor input, - int outChannels, - int kernelH, - 
int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, - ActiMode activation, - int groups, - bool use_bias, - bool allocate_weights, - char const *name) - : Op(model, - OP_CONV2D, - DT_FLOAT, - name, - 1 /*inputs*/, - use_bias ? 2 : 1 /*weights*/, - allocate_weights, - 1 /*outputs*/, - input), - in_channels(input->dims[Conv2DInput::CHANNEL].size / - input->dims[Conv2DInput::CHANNEL].degree), - out_channels(outChannels), kernel_h(kernelH), kernel_w(kernelW), - stride_h(strideH), stride_w(strideW), padding_h(paddingH), - padding_w(paddingW), activation(activation), groups(groups), - use_bias(use_bias) { - // overwrite layer_guid - layer_guid = _layer_guid; - assert(input->num_dims == Conv2DInput::NUMDIM); - assert(this->stride_h > 0); - assert(this->stride_w > 0); - - ParallelDim output_dims[MAX_TENSOR_DIM], kernel_dims[MAX_TENSOR_DIM], - bias_dims[MAX_TENSOR_DIM]; - int output_ndims, kernel_ndims, bias_ndims; - - this->construct_mappings(*this->parallel_dims_mapping, this->use_bias); - this->get_params().solve_dims(this->inputs[0]->get_shape(), - output_dims, - &output_ndims, - kernel_dims, - &kernel_ndims, - bias_dims, - &bias_ndims); - - if (allocate_weights) { - Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); - - weights[Conv2DKernel::INDEX] = - model.create_parallel_weight_legion_ordering(kernel_ndims, - kernel_dims, - DT_FLOAT, - NULL /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - - if (use_bias) { - Initializer *bias_initializer = new ZeroInitializer(); - - weights[Conv2DBias::INDEX] = - model.create_parallel_weight_legion_ordering(bias_ndims, - bias_dims, - DT_FLOAT, - NULL /*owner_op*/, - true /*create_grad*/, - bias_initializer, - CHOSEN_SYNC_TYPE); - } - } - - outputs[0] = model.create_parallel_tensor_legion_ordering( - output_ndims, output_dims, DT_FLOAT, this); - - assert(check_output_input_weight_parallel_dims(allocate_weights)); -} - -static OpTaskSignature 
get_init_task_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - init.add_arg_slot(PROFILING); - - init.add_input_slot(INPUT); - init.add_output_slot(OUTPUT, WRITE_ONLY); - init.add_param_slot(FILTER); - init.add_param_slot(BIAS); - init.add_param_grad_slot(FILTER_GRAD, WRITE_ONLY); - init.add_input_grad_slot(INPUT_GRAD); - - return init; -} - -static OpTaskSignature get_fwd_task_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(ATTRS); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT, WRITE_ONLY); - fwd.add_param_slot(FILTER); - fwd.add_param_slot(BIAS); - - return fwd; -} - -static OpTaskSignature get_bwd_task_signature() { - OpTaskSignature bwd(OpTaskType::BWD); +enum Slots { + INPUT, + OUTPUT, + FILTER, + BIAS, + ATTRS, + PROFILING, + PER_DEVICE_STATE, + HANDLE +}; - bwd.add_arg_slot(ATTRS); +OpTaskInvocation init(Conv2DAttrs const &attrs) { + OpTaskBinding binding; - bwd.add_input_slot(INPUT); - bwd.add_input_grad_slot(INPUT_GRAD, READ_WRITE); - bwd.add_output_slot(OUTPUT); - bwd.add_output_grad_slot(OUTPUT_GRAD, READ_WRITE); - bwd.add_param_slot(FILTER); - bwd.add_param_grad_slot(FILTER_GRAD, READ_WRITE); - bwd.add_param_grad_slot(BIAS_GRAD, READ_WRITE); + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); - return bwd; + return {CONV2D_INIT_TASK_ID, binding}; } -OpTaskBinding Conv2d::get_init_task_binding() const { +OpTaskInvocation forward(Conv2DAttrs const &attrs) { OpTaskBinding binding; - binding.bind_arg(ATTRS, this->attrs); - binding.bind_arg(PROFILING, this->profiling); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, param_tensor(0)); - binding.bind(BIAS, param_tensor(1)); - binding.bind(FILTER_GRAD, param_tensor(0).grad()); - binding.bind(INPUT_GRAD, input_tensor(0).grad()); + 
binding.bind(FILTER, weight_tensor(0)); + binding.bind(BIAS, weight_tensor(1)); - return binding; + return {CONV2D_FWD_TASK_ID, binding}; } -OpTaskBinding Conv2d::get_fwd_task_binding() const { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, this->attrs); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, param_tensor(0)); - binding.bind(BIAS, param_tensor(1)); +OpTaskInvocation backward(Conv2DAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - return binding; + return {CONV2D_BWD_TASK_ID, binding}; } -OpTaskBinding Conv2d::get_bwd_task_binding() const { - OpTaskBinding binding; +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { - binding.bind_arg(ATTRS, this->attrs); + PerDeviceFFHandle handle = acc.get_argument(HANDLE); + auto const &attrs = acc.get_argument(ATTRS); - binding.bind(INPUT, input_tensor(0)); - binding.bind(INPUT_GRAD, input_tensor(0).grad()); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(OUTPUT_GRAD, output_tensor(0).grad()); - binding.bind(FILTER, param_tensor(0)); - binding.bind(FILTER_GRAD, param_tensor(0).grad()); - binding.bind(BIAS_GRAD, param_tensor(1).grad()); + ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t biasTensor; + ffTensorDescriptor_t outputTensor; + ffFilterDescriptor_t filterDesc; + ffActivationDescriptor_t actiDesc; + ffConvolutionDescriptor_t convDesc; + ffConvolutionFwdAlgo_t fwdAlgo; + ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; + ffConvolutionBwdDataAlgo_t bwdDataAlgo; - return binding; -} + DeviceSpecific per_device_state = + acc.create_device_specific( + init_kernel(handle, + inputTensor, + biasTensor, + outputTensor, + filterDesc, + actiDesc, + convDesc, + fwdAlgo, + bwdFilterAlgo, + bwdDataAlgo, + attrs.activation, + attrs.use_bias)); -void Conv2D::init(FFModel const &ff) { - this->execute_task(ff, CONV2D_INIT_TASK_ID, get_init_task_signature()); - // 
assert(check_output_input_weight_same_parallel_is()); - // parallel_is = outputs[0]->parallel_is; - // ArgumentMap argmap; - // Context ctx = ff.config.lg_ctx; - // Runtime *runtime = ff.config.lg_hlr; - // set_argumentmap_for_init(ff, argmap); - // IndexLauncher launcher(CONV2D_INIT_TASK_ID, - // parallel_is, - // TaskArgument(this, sizeof(Conv2D)), - // argmap, - // Predicate::TRUE_PRED, - // false /*must*/, - // 0 /*mapper_id*/, - // outputs[0]->machine_view.hash()); - // launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - // 0 /*projection id*/, - // READ_ONLY, - // EXCLUSIVE, - // inputs[0]->region)); - // launcher.add_field(0, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - // 0 /*projection id*/, - // WRITE_ONLY, - // EXCLUSIVE, - // outputs[0]->region)); - // launcher.add_field(1, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(weights[0]->part, - // 0 /*projection id*/, - // READ_ONLY, - // EXCLUSIVE, - // weights[0]->region)); - // launcher.add_field(2, FID_DATA); - // // launcher.add_region_requirement( - // // RegionRequirement(weights[1]->part, 0/*projection id*/, - // // READ_ONLY, EXCLUSIVE, weights[1]->region)); - // // launcher.add_field(3, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, - // 0 /*projection id*/, - // WRITE_ONLY, - // EXCLUSIVE, - // weights[0]->region_grad)); - // launcher.add_field(3, FID_DATA); - // // launcher.add_region_requirement( - // // RegionRequirement(inputs[0]->part_grad, 0/*projection id*/, - // // WRITE_ONLY, EXCLUSIVE, inputs[0]->region_grad)); - // // launcher.add_field(4, FID_DATA); - // FutureMap fm = runtime->execute_index_space(ctx, launcher); - // fm.wait_all_results(); - // set_opmeta_from_futuremap(ff, fm); + return per_device_state; } -/* - regions[0]: input - regions[1]: output - regions[2](I): filter - regions[3](I): bias - regions[4](O): filter_grad - regions[5](O): input_grad -*/ 
-PerDeviceOpState *Conv2D::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 4); - assert(task->regions.size() == 4); - // Conv2D const *conv = (Conv2D *)task->args; +static DeviceSpecific + init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { TaskArgumentAccessor acc(task, regions, ctx, runtime); - FFHandler handle = *((FFHandler const *)task->local_args); - auto const &attrs = acc.get_argument(ATTRS); - bool profiling = acc.get_argument(PROFILING); - // TensorAccessorR acc_input( - // regions[0], task->regions[0], FID_DATA, ctx, runtime); - // TensorAccessorW acc_output(regions[1], - // task->regions[1], - // FID_DATA, - // ctx, - // runtime, - // false - // /*readOutput*/); - // TensorAccessorR acc_kernel( - // regions[2], task->regions[2], FID_DATA, ctx, runtime); - // TensorAccessorR acc_bias( - // regions[3], task->regions[3], FID_DATA, ctx, runtime); - // TensorAccessorW acc_kernel_grad( - // regions[3], - // task->regions[3], - // FID_DATA, - // ctx, - // runtime, - // false /*readOutput*/); - // TensorAccessorW acc_input_grad( - // regions[4], task->regions[4], FID_DATA, ctx, runtime, - // false/*readOutput*/); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto filter = acc.get_tensor(FILTER); - auto bias = acc.get_tensor(BIAS); - auto filter_grad = acc.get_tensor(FILTER_GRAD); - auto input_grad = acc.get_tensor(INPUT_GRAD); - - Conv2DPerDeviceState *m = new Conv2DPerDeviceState(handle); - m->relu = attrs.activation == AC_MODE_RELU; - m->use_bias = attrs.use_bias; - m->profiling = profiling; - // m->trainableInputs[0] = conv->trainableInputs[0]; ?? 
- std::strcpy(m->op_name, attrs.name); - - int input_w = input.shape[0]; - int input_h = input.shape[1]; - int input_c = input.shape[2]; - int input_n = input.shape[3]; - int output_w = output.shape[0]; - int output_h = output.shape[1]; - int output_c = output.shape[2]; - int output_n = output.shape[3]; - - printf("init conv (input): n(%d) c(%d) h(%d) w(%d)\n", - input_n, - input_c, - input_h, - input_w); - printf("init conv (output): n(%d) c(%d) h(%d) w(%d)\n", - output_n, - output_c, - output_h, - output_w); - - // printf("convDim: padding(%d %d) stride(%d %d)\n", conv->padding_h, - // conv->padding_w, conv->stride_h, conv->stride_w); - int pad_h = - ((output_h - 1) * attrs.stride_h + attrs.kernel_h - input_h + 1) / 2; - int pad_w = - ((output_w - 1) * attrs.stride_w + attrs.kernel_w - input_w + 1) / 2; - if (pad_h != attrs.padding_h) { - printf("Warning: changing conv_padding_h to satisfy output_h size\n"); - } - if (pad_w != attrs.padding_w) { - printf("Warning: changing conv_padding_w to satisfy output_w size\n"); - } - - init_kernel(m, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - attrs.kernel_h, - attrs.kernel_w, - attrs.groups, - attrs.stride_h, - attrs.stride_w, - pad_h, - pad_w, - input.get_float_ptr(), - output.get_float_ptr(), - filter.get_float_ptr(), - filter_grad.get_float_ptr()); - - return m; + return init_task_impl(acc); } -// TaskSpec Conv2D::get_tasks_spec() const { -// OpTasksSpec spec { -// CONV2D_INIT_TASK_ID, -// CONV2D_FWD_TASK_ID, -// CONV2D_BWD_TASK_ID -// }; -// auto &fwd = spec.get_fwd(); - -// fwd.add_input_slot(INPUT); -// fwd.add_param_slot(KERNEL); -// fwd.add_output_slot(OUTPUT); - -// auto input = spec.input_tensor(0); -// auto kernel = spec.param_tensor(0); -// auto bias = spec.param_tensor(1); -// auto output = spec.output_tensor(0); - -// fwd[INPUT] = input; -// fwd[KERNEL] = kernel; -// if (this->use_bias) { -// fwd[BIAS] = bias; -// } -// fwd[OUTPUT] = output; - -// return spec; 
-// } - -/* TaskSpec Conv2D::get_forward_task_spec() const { */ -/* TaskSpec spec = { CONV2D_FWD_TASK_ID, Pass::FWD }; */ - -/* auto input = spec.add_tensor(TensorRole::INPUT, 0); */ -/* auto kernel = spec.add_tensor(TensorRole::PARAM, 0); */ -/* auto bias = spec.add_tensor(TensorRole::BIAS, 1); */ -/* auto output = spec.add_tensor(TensorRole::OUTPUT, 0); */ - -/* spec.add_input(INPUT, input); */ -/* spec.add_input(KERNEL, kernel); */ - -/* if (this->use_bias) { */ -/* spec.add_input(BIAS, bias); */ -/* } */ - -/* spec.add_output(OUTPUT, output); */ - -/* return spec; */ -/* } */ - -/* TaskSpec Conv2D::get_backward_task_spec() const { */ -/* TaskSpec spec = { CONV2D_BWD_TASK_ID, Pass::BWD }; */ +static optional forward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); -/* auto input = spec.add_tensor(TensorRole::INPUT, 0); */ -/* auto kernel = spec.add_tensor(TensorRole::PARAM, 0); */ -/* auto bias = spec.add_tensor(TensorRole::BIAS, 1); */ -/* auto output = spec.add_tensor(TensorRole::OUTPUT, 0); */ + auto input = acc.get_tensor(INPUT); + auto filter = acc.get_tensor(FILTER); + auto bias = acc.get_tensor(BIAS); + auto output = acc.get_tensor(OUTPUT); -/* spec.add_input(INPUT, input); */ -/* spec.add_output(INPUT_GRAD, input.grad); */ -/* spec.add_input(KERNEL, kernel); */ -/* spec.add_output(KERNEL_GRAD, kernel.grad); */ + return profile(forward_kernel, + profiling, + "[Conv2d] forward_time = %.2lfms\n", + &per_device_state, + input.get_float_ptr(), + output.get_float_ptr(), + filter.get_float_ptr(), + bias.get_float_ptr()); +} -/* if (this->use_bias) { */ -/* spec.add_input(BIAS, bias); */ -/* spec.add_output(BIAS_GRAD, bias.grad); */ -/* } */ +static void forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); +} -/* 
spec.add_input(OUTPUT, output); */ -/* spec.add_input(OUTPUT_GRAD, output.grad); */ +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); -/* return spec; */ -/* } */ + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto filter = acc.get_tensor(FILTER); -void Conv2D::forward(FFModel const &ff) { - this->execute_task(ff, CONV2D_FWD_TASK_ID, get_fwd_task_signature()); -} + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto filter_grad = acc.get_tensor_grad(FILTER); + auto bias_grad = acc.get_tensor_grad(BIAS); -void Conv2D::backward(FFModel const &ff) { - this->execute_task(ff, CONV2D_bWD_TASK_ID, get_bwd_task_signature()); + return profile(backward_kernel, + profiling, + "[Conv2d] backward_time = %.2lfms\n", + &per_device_state, + input.get_float_ptr(), + input_grad.get_float_ptr(), + output.get_float_ptr(), + output_grad.get_float_ptr(), + filter.get_float_ptr(), + filter_grad.get_float_ptr(), + bias_grad.get_float_ptr()); } -/* - regions[0](I): input - regions[1](O): output - regions[2](I): filter - regions[3](I): bias -*/ -void Conv2D::forward_task(Task const *task, +static void backward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - Conv2DPerDeviceState const *m = *((Conv2DPerDeviceState **)task->local_args); - TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); +} - auto input = acc.get_tensor(INPUT); - auto filter = acc.get_tensor(FILTER); - auto bias = acc.get_tensor(BIAS); - auto output = acc.get_tensor(OUTPUT); +CostMetrics measure_operator_cost(SimEnvFactory const &sim, + Conv2DAttrs const &attrs, + InputParallelTensorDesc const &input_shape, + InputParallelTensorDesc const &filter_shape, + InputParallelTensorDesc const &bias_shape, + ProfilingSettings const &settings, + 
MachineView const &mv) { - // TensorAccessorR acc_input( - // regions[0], task->regions[0], FID_DATA, ctx, runtime); - // TensorAccessorW acc_output(regions[1], - // task->regions[1], - // FID_DATA, - // ctx, - // runtime, - // false - // /*readOutput*/); - // TensorAccessorR acc_kernel( - // regions[2], task->regions[2], FID_DATA, ctx, runtime); - // float const *acc_bias_ptr = NULL; - // if (m->use_bias) { - // TensorAccessorR acc_bias( - // regions[3], task->regions[3], FID_DATA, ctx, runtime); - // acc_bias_ptr = acc_bias.ptr; - // } + auto env = sim.new_environment(); - profile(forward_kernel, - m->profiling, - "[Conv2d] forward_time = %.2lfms\n", - m, - input.get_float_ptr(), - output.get_float_ptr(), - filter.get_float_ptr(), - bias.get_float_ptr()); -} + ParallelTensorShape output_shape = get_output_shape(attrs, input_shape.shape); -/* - region(I): input - region(I/O): input_grad (if trainableInputs[0]) - region(I): output - region(I/O): output_grad - region(I): filter - region(I/O): filter_grad - region(I/O): bias_grad (if use_bias) -*/ -void Conv2D::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // Conv2D* conv = (Conv2D*) task->args; - Conv2DPerDeviceState const *m = *((Conv2DPerDeviceState **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + - static_cast(m->use_bias))); - assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + - static_cast(m->use_bias))); - size_t rid = 0; - TensorAccessorR acc_input( - regions[rid], task->regions[rid], FID_DATA, ctx, runtime); - rid++; - float *acc_input_grad_ptr = NULL; - if (m->trainableInputs[0]) { - TensorAccessorW acc_input_grad( - regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - acc_input_grad_ptr = acc_input_grad.ptr; - rid++; - } - TensorAccessorR acc_output( - regions[rid], task->regions[rid], FID_DATA, ctx, runtime); - rid++; - TensorAccessorW 
acc_output_grad( - regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - rid++; - TensorAccessorR acc_kernel( - regions[rid], task->regions[rid], FID_DATA, ctx, runtime); - rid++; - TensorAccessorW acc_kernel_grad( - regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - rid++; - float *acc_bias_grad_ptr = NULL; - if (m->use_bias) { - TensorAccessorW acc_bias_grad( - regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - acc_bias_grad_ptr = static_cast(acc_bias_grad.ptr); - rid++; - } - assert(rid == regions.size()); + SimTaskBinding init_binding; + init_binding.bind_arg(ATTRS, attrs); + init_binding.bind_arg(HANDLE, ff_handle()); - backward_kernel_wrapper(m, - acc_input.ptr, - acc_input_grad_ptr, - acc_output.ptr, - acc_output_grad.ptr, - acc_kernel.ptr, - acc_kernel_grad.ptr, - acc_bias_grad_ptr); -} + auto init_accessor = env.get_init_accessor(CONV2D_INIT_TASK_ID, init_binding); + DeviceSpecific per_device_state = + init_task_impl(init_accessor); + + SimTaskBinding fwd_binding; + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); -bool Conv2D::estimate_sync_cost(Simulator *sim, - MachineView const &view, - CostMetrics &cost_metrics) const { - ParallelDim kernel_dims[MAX_TENSOR_DIM], bias_dims[MAX_TENSOR_DIM]; - int kernel_ndims, bias_ndims; + fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind(FILTER, filter_shape); + fwd_binding.bind(BIAS, bias_shape); - this->get_params().solve_dims(this->inputs[0]->get_shape(), - nullptr, - nullptr, - kernel_dims, - &kernel_ndims, - bias_dims, - &bias_ndims); + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - cost_metrics.sync_time = - sim->default_estimate_sync_cost(kernel_dims, kernel_ndims, view); + auto fwd_accessor = env.get_fwd_accessor(CONV2D_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = 
env.get_bwd_accessor(CONV2D_BWD_TASK_ID, bwd_binding); - if (this->use_bias) { - cost_metrics.sync_time += - sim->default_estimate_sync_cost(bias_dims, bias_ndims, view); - } + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); - return true; + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); } -tl::optional Conv2D::as_dot() const { - RecordFormatter rr; - RecordFormatter r; +template <> +void register_task() { + OpTaskSignature init(OpTaskType::INIT); - r << this->inputs[0]->get_shape().as_dot(); - r << "in_channels" << this->in_channels; - r << "out_channels" << this->out_channels; - r << "kernel_h" << this->kernel_h; - r << "kernel_w" << this->kernel_w; - r << "padding_h" << this->padding_h; - r << "padding_w" << this->padding_w; - r << "stride_h" << this->stride_h; - r << "stride_w" << this->stride_w; - r << this->outputs[0]->get_shape().as_dot(); - rr << r; + init.add_arg_slot(ATTRS); + init.add_unchecked_arg_slot(HANDLE); - return rr; + register_task(CONV2D_INIT_TASK_ID, "Conv2D Init", init, init_task); } -bool Conv2D::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_output, sub_input; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - int input_w = sub_input.dims[0].size; - int input_h = sub_input.dims[1].size; - int input_c = sub_input.dims[2].size; - int input_n = sub_input.dims[3].size; - int output_w = sub_output.dims[0].size; - int output_h = sub_output.dims[1].size; - int output_c = sub_output.dims[2].size; - int output_n = sub_output.dims[3].size; - int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; - int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - - Conv2DPerDeviceState *m = sim->conv2d_meta; - 
m->relu = activation == AC_MODE_RELU; - // require input_c is divisible by groups +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FWD); - // allocate tensors in simulator - sim->free_all(); - float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + fwd.add_arg_slot(PROFILING); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_ptr != NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + fwd.add_weight_slot(FILTER); + fwd.add_weight_slot(BIAS); - float *weight_ptr = (float *)sim->allocate( - (size_t)output_c * input_c * kernel_h * kernel_w / groups, DT_FLOAT); - assert(weight_ptr != NULL); - float *bias_ptr = (float *)sim->allocate(output_c, DT_FLOAT); - assert(bias_ptr != NULL); - cost_metrics.weights_memory += cost_metrics.total_mem_diff_from(sim->offset); + register_task(CONV2D_FWD_TASK_ID, "Conv2D Fwd", fwd, forward_task); +} - init_kernel(m, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - kernel_h, - kernel_w, - groups, - stride_h, - stride_w, - pad_h, - pad_w, - input_ptr, - output_ptr, - weight_ptr, - weight_ptr, // note we reuse weight_ptr for kernel_grad_ptr here - // to avoid allocating another tensor - &cost_metrics.forward_time, - &cost_metrics.backward_time); +template <> +void register_task() { + OpTaskSignature bwd = + infer_bwd_signature(get_op_signature(CONV2D_FWD_TASK_ID)); - log_measure.debug("[Measure Conv2D] name(%s) input(%d %d %d %d) weight(%d %d " - "%d %d) output(%d %d %d %d) stride(%d %d) padding(%d %d) " - "forward_time(%.4lf) backward_time(%.4lf)\n", - name, - input_n, - input_c, - input_h, - input_w, - output_c, - input_c / groups, - kernel_h, - kernel_w, - 
output_n, - output_c, - output_h, - output_w, - stride_h, - stride_w, - padding_h, - padding_w, - cost_metrics.forward_time, - cost_metrics.backward_time); - return true; + register_task(CONV2D_BWD_TASK_ID, "Conv2D Bwd", bwd, backward_task); } -} // namespace FlexFlow +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/runtime/src/ops/conv_2d.h b/lib/runtime/src/ops/conv_2d.h index 382538b70a..eefae82c4c 100644 --- a/lib/runtime/src/ops/conv_2d.h +++ b/lib/runtime/src/ops/conv_2d.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_CONV_2D_H #include "op-attrs/ops/conv_2d.h" -#include "op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { @@ -134,3 +134,692 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, } // namespace FlexFlow #endif + +// Tensor FFModel::conv2d(Tensor const &input, +// int outChannels, +// int kernelH, +// int kernelW, +// int strideH, +// int strideW, +// int paddingH, +// int paddingW, +// ActiMode activation, +// int groups, +// bool use_bias, +// Layer const *shared_op, +// Initializer *kernel_initializer, +// Initializer *bias_initializer, +// char const *name) { +// assert(input->num_dims() == 4); /*NCHW*/ + +// Conv2DAttrs attrs = {outChannels, +// kernelH, +// kernelW, +// strideH, +// strideW, +// paddingH, +// paddingW, +// groups, +// activation, +// use_bias}; + +// TensorShape output_shape = get_output_shape(attrs, input->get_shape()); +// Tensor output = this->tensor_mgr.create(output_shape, CreateGrad::YES, +// conv); + +// std::vector weights; + +// TensorShape kernel_shape = get_kernel_shape(attrs, input->get_shape()); +// weights.push_back(this->tensor_mgr.create( +// kernel_shape, CreateGrad::YES, kernel_initializer, CHOSEN_SYNC_TYPE)); + +// if (use_bias) { +// TensorShape bias_shape = get_bias_shape(attrs, input->get_shape()); +// weights.push_back(this->tensor_mgr.create( +// bias_shape, CreateGrad::YES, bias_initializer, CHOSEN_SYNC_TYPE)); 
+// } + +// Layer *conv = +// this->layer_mgr.create(attrs, DT_FLOAT, name, {input}, weights, +// {output}); + +// //{ +// // int numdims = 4; +// // int dims[MAX_TENSOR_DIM]; +// // dims[3] = input->dims[3]; +// // dims[2] = outChannels; +// // dims[1] = 1 + (input->dims[1] + 2 * paddingH - kernelH) / strideH; +// // dims[0] = 1 + (input->dims[0] + 2 * paddingW - kernelW) / strideW; +// // conv->outputs[0] = create_tensor_legion_ordering( +// // numdims, dims, DT_FLOAT, conv, 0, true /*create_grad*/); +// //} +// //{ +// // int dims[4] = {kernelW, kernelH, input->dims[2], outChannels}; +// // conv->weights[0] = create_weight_legion_ordering(4, +// // dims, +// // DT_FLOAT, +// // conv, +// // true /*create_grad*/, +// // kernel_initializer, +// // CHOSEN_SYNC_TYPE); +// //} +// // if (use_bias) { +// // int dims[1] = {outChannels}; +// // conv->weights[1] = create_weight_legion_ordering(1, +// // dims, +// // DT_FLOAT, +// // conv, +// // true /*create_grad*/, +// // bias_initializer, +// // CHOSEN_SYNC_TYPE); +// //} +// conv->add_initializer("kernel", kernel_initializer); +// conv->add_initializer("bias", bias_initializer); +// /* layers.push_back(conv); */ +// return conv->outputs[0]; +// } + +// Op *Conv2D::create_operator_from_layer( +// FFModel &model, +// Layer const *layer, +// std::vector const &inputs) { +// return new Conv2D(model, +// get(layer->attrs), +// inputs, +// layer->name, +// false /*allocate_weights*/ +// ); +// } + +/* void Conv2DParams::mark_replica_dims( */ +/* ParallelTensorShape const &input, */ +/* ParallelDim output_dims[MAX_TENSOR_DIM], */ +/* ParallelDim kernel_dims[MAX_TENSOR_DIM], */ +/* ParallelDim bias_dims[MAX_TENSOR_DIM]) const { */ +/* if (output_dims != nullptr) { */ +/* output_dims[Conv2DOutput::REPLICA].is_replica_dim = true; */ +/* } */ +/* if (kernel_dims != nullptr) { */ +/* kernel_dims[Conv2DOutput::REPLICA].is_replica_dim = true; */ +/* } */ +/* if (bias_dims != nullptr) { */ +/* 
bias_dims[Conv2DBias::REPLICA_1].is_replica_dim = true; */ +/* bias_dims[Conv2DBias::REPLICA_2].is_replica_dim = true; */ +/* bias_dims[Conv2DBias::REPLICA_3].is_replica_dim = true; */ +/* bias_dims[Conv2DBias::REPLICA_4].is_replica_dim = true; */ +/* } */ +/* } */ + +/* int Conv2DParams::output_size(ParallelTensorShape const &input, */ +/* ParallelDim output_dims[MAX_TENSOR_DIM]) const + * { */ +/* int input_w = input.dims[Conv2DInput::WIDTH].size; */ +/* int input_h = input.dims[Conv2DInput::HEIGHT].size; */ + +/* output_dims[Conv2DOutput::SAMPLE].size = + * input.dims[Conv2DInput::SAMPLE].size; */ +/* output_dims[Conv2DOutput::CHANNEL].size = out_channels; */ +/* output_dims[Conv2DOutput::HEIGHT].size = */ +/* 1 + (input_h + 2 * padding_h - kernel_h) / stride_h; */ +/* output_dims[Conv2DOutput::WIDTH].size = */ +/* 1 + (input_w + 2 * padding_w - kernel_w) / stride_w; */ + +/* return input.num_dims; */ +/* }; */ + +/* int Conv2DParams::kernel_size(ParallelTensorShape const &input, */ +/* ParallelDim kernel_dims[MAX_TENSOR_DIM]) const + * { */ +/* kernel_dims[Conv2DKernel::CHANNEL_OUT].size = this->out_channels; */ +/* kernel_dims[Conv2DKernel::CHANNEL_IN].size = */ +/* input.dims[Conv2DInput::CHANNEL].size / this->groups; */ +/* kernel_dims[Conv2DKernel::HEIGHT].size = */ +/* this->kernel_h * input.dims[Conv2DInput::HEIGHT].degree; */ +/* kernel_dims[Conv2DKernel::WIDTH].size = */ +/* this->kernel_w * input.dims[Conv2DInput::WIDTH].degree; */ + +/* return Conv2DKernel::NUMDIM; */ +/* } */ + +/* int Conv2DParams::bias_size(ParallelTensorShape const &input, */ +/* ParallelDim bias_dims[MAX_TENSOR_DIM]) const { */ +/* bias_dims[Conv2DBias::CHANNEL].size = this->out_channels; */ + +/* return Conv2DBias::NUMDIM; */ +/* }; */ + +/* void Conv2DParams::solve_dims(ParallelTensorShape const &input, */ +/* ParallelDim output_dims[MAX_TENSOR_DIM], */ +/* int *output_ndims, */ +/* ParallelDim kernel_dims[MAX_TENSOR_DIM], */ +/* int *kernel_ndims, */ +/* ParallelDim 
bias_dims[MAX_TENSOR_DIM], */ +/* int *bias_ndims) const { */ +/* assert((output_dims == nullptr) == (output_ndims == nullptr)); */ +/* assert((kernel_dims == nullptr) == (kernel_ndims == nullptr)); */ +/* assert((bias_dims == nullptr) == (bias_ndims == nullptr)); */ + +/* std::vector mapping; */ +/* Conv2D::construct_mappings(mapping, this->use_bias); */ + +/* this->mark_replica_dims(input, output_dims, kernel_dims, bias_dims); */ + +/* std::vector output_dim_sets; */ +/* if (output_dims != nullptr) { */ +/* output_dim_sets.push_back(output_dims); */ +/* } */ + +/* std::vector weight_dim_sets; */ +/* if (kernel_dims != nullptr) { */ +/* weight_dim_sets.push_back(kernel_dims); */ +/* } */ +/* if (bias_dims != nullptr && this->use_bias) { */ +/* weight_dim_sets.push_back(bias_dims); */ +/* } */ + +/* solve_parallel_dim_mappings( */ +/* mapping, {input.dims}, weight_dim_sets, output_dim_sets); */ + +/* if (output_dims != nullptr) { */ +/* *output_ndims = this->output_size(input, output_dims); */ +/* } */ +/* if (kernel_dims != nullptr) { */ +/* *kernel_ndims = this->kernel_size(input, kernel_dims); */ +/* } */ +/* if (bias_dims != nullptr && this->use_bias) { */ +/* *bias_ndims = this->bias_size(input, bias_dims); */ +/* } */ +/* } */ + +/*static*/ +/* void Conv2D::construct_mappings(std::vector &out, + */ +/* bool use_bias) { */ +/* Conv2D::construct_output_mappings(out); */ +/* Conv2D::construct_weight_mappings(out, use_bias); */ +/* } */ + +/*static*/ +/* void Conv2D::construct_output_mappings( */ +/* std::vector &out) { */ +/* Op::construct_output_parallel_dims( */ +/* out, */ +/* {{Conv2DInput::CHANNEL, */ +/* MappingOperation::REPLICATE, */ +/* Conv2DOutput::REPLICA}, */ +/* {Conv2DInput::SAMPLE, MappingOperation::PARTITION, + * Conv2DOutput::SAMPLE}, */ +/* {Conv2DInput::REPLICA, */ +/* MappingOperation::PARTITION, */ +/* Conv2DOutput::CHANNEL}, */ +/* {Conv2DInput::HEIGHT, MappingOperation::PARTITION, + * Conv2DOutput::HEIGHT}, */ +/* {Conv2DInput::WIDTH, 
MappingOperation::PARTITION, + * Conv2DOutput::WIDTH}}); */ +/* } */ + +/*static*/ +/* void Conv2D::construct_weight_mappings( */ +/* std::vector &out, bool use_bias) { */ +/* Op::construct_weight_parallel_dims( */ +/* out, */ +/* { */ +/* {Conv2DInput::REPLICA, */ +/* MappingOperation::PARTITION, */ +/* Conv2DKernel::CHANNEL_OUT}, */ +/* {Conv2DInput::SAMPLE, */ +/* MappingOperation::REPLICATE, */ +/* Conv2DKernel::REPLICA}, */ +/* {Conv2DInput::CHANNEL, */ +/* MappingOperation::PARTITION, */ +/* Conv2DKernel::CHANNEL_IN}, */ +/* {Conv2DInput::HEIGHT, */ +/* MappingOperation::REPLICATE, */ +/* Conv2DKernel::HEIGHT}, // Kernel::{HEIGHT, WEIGHT} would both work + */ +/* // here */ +/* {Conv2DInput::WIDTH, */ +/* MappingOperation::REPLICATE, */ +/* Conv2DKernel::WIDTH}, // same as above */ +/* }, */ +/* Conv2DInput::INDEX, */ +/* Conv2DKernel::INDEX); */ + +/* if (use_bias) { */ +/* Op::construct_weight_parallel_dims( */ +/* out, */ +/* {{Conv2DInput::REPLICA, Conv2DBias::REPLICA_1}, */ +/* {Conv2DInput::SAMPLE, Conv2DBias::REPLICA_2}, */ +/* {Conv2DInput::CHANNEL, Conv2DBias::CHANNEL}, */ +/* {Conv2DInput::HEIGHT, Conv2DBias::REPLICA_3}, */ +/* {Conv2DInput::WIDTH, Conv2DBias::REPLICA_4}}, */ +/* Conv2DInput::INDEX, */ +/* Conv2DBias::INDEX); */ +/* } */ +/* } */ + +// Conv2D::Conv2D(FFModel &model, +// Conv2D const &other, +// const ParallelTensor input, +// bool allocate_weights) +// : Conv2D(model, +// other.layer_guid, +// input, +// other.out_channels, +// other.kernel_h, +// other.kernel_w, +// other.stride_h, +// other.stride_w, +// other.padding_h, +// other.padding_w, +// other.activation, +// other.groups, +// other.use_bias, +// allocate_weights, +// other.name) {} + +// Conv2D::Conv2D(FFModel &model, +// Conv2DAttrs const &attrs, +// std::vector const &inputs, +// char const *name, +// bool allocate_weights) +// : Conv2D(model, +// params.layer_guid, +// input, +// params.out_channels, +// params.kernel_h, +// params.kernel_w, +// params.stride_h, +// 
params.stride_w, +// params.padding_h, +// params.padding_w, +// params.activation, +// params.groups, +// params.use_bias, +// allocate_weights, +// name) {} + +/* bool Conv2DParams::is_valid(ParallelTensorShape const &input) const { */ +/* ParallelTensorShape output_shape, kernel_shape, bias_shape; */ +/* this->solve_dims(input, */ +/* output_shape.dims, */ +/* &output_shape.num_dims, */ +/* kernel_shape.dims, */ +/* &kernel_shape.num_dims, */ +/* bias_shape.dims, */ +/* &bias_shape.num_dims); */ +/* bool is_valid = true; */ +/* is_valid &= input.is_valid(); */ +/* is_valid &= output_shape.is_valid(); */ +/* is_valid &= kernel_shape.is_valid(); */ +/* if (use_bias) { */ +/* is_valid &= bias_shape.is_valid(); */ +/* } */ + +/* // TODO FIXME: Currently disable parallelizing the height and width + * dimension */ +/* if (input.dims[0].degree > 1 || input.dims[1].degree > 1) { */ +/* return false; */ +/* } */ + +/* return is_valid; */ +/* } */ + +// Conv2D::Conv2D(FFModel &model, +// LayerID const &_layer_guid, +// const ParallelTensor input, +// int outChannels, +// int kernelH, +// int kernelW, +// int strideH, +// int strideW, +// int paddingH, +// int paddingW, +// ActiMode activation, +// int groups, +// bool use_bias, +// bool allocate_weights, +// char const *name) +// : Op(model, +// OP_CONV2D, +// DT_FLOAT, +// name, +// 1 /*inputs*/, +// use_bias ? 
2 : 1 /*weights*/, +// allocate_weights, +// 1 /*outputs*/, +// input), +// in_channels(input->dims[Conv2DInput::CHANNEL].size / +// input->dims[Conv2DInput::CHANNEL].degree), +// out_channels(outChannels), kernel_h(kernelH), kernel_w(kernelW), +// stride_h(strideH), stride_w(strideW), padding_h(paddingH), +// padding_w(paddingW), activation(activation), groups(groups), +// use_bias(use_bias) { +// // overwrite layer_guid +// layer_guid = _layer_guid; +// assert(input->num_dims == Conv2DInput::NUMDIM); +// assert(this->stride_h > 0); +// assert(this->stride_w > 0); + +// ParallelDim output_dims[MAX_TENSOR_DIM], kernel_dims[MAX_TENSOR_DIM], +// bias_dims[MAX_TENSOR_DIM]; +// int output_ndims, kernel_ndims, bias_ndims; + +// this->construct_mappings(*this->parallel_dims_mapping, this->use_bias); +// this->get_params().solve_dims(this->inputs[0]->get_shape(), +// output_dims, +// &output_ndims, +// kernel_dims, +// &kernel_ndims, +// bias_dims, +// &bias_ndims); + +// if (allocate_weights) { +// Initializer *kernel_initializer = new GlorotUniform(std::rand() +// /*seed*/); + +// weights[Conv2DKernel::INDEX] = +// model.create_parallel_weight_legion_ordering(kernel_ndims, +// kernel_dims, +// DT_FLOAT, +// NULL /*owner_op*/, +// true /*create_grad*/, +// kernel_initializer, +// CHOSEN_SYNC_TYPE); + +// if (use_bias) { +// Initializer *bias_initializer = new ZeroInitializer(); + +// weights[Conv2DBias::INDEX] = +// model.create_parallel_weight_legion_ordering(bias_ndims, +// bias_dims, +// DT_FLOAT, +// NULL /*owner_op*/, +// true /*create_grad*/, +// bias_initializer, +// CHOSEN_SYNC_TYPE); +// } +// } + +// outputs[0] = model.create_parallel_tensor_legion_ordering( +// output_ndims, output_dims, DT_FLOAT, this); + +// assert(check_output_input_weight_parallel_dims(allocate_weights)); +// } + +// tl::optional Conv2D::as_dot() const { +// RecordFormatter rr; +// RecordFormatter r; + +// r << this->inputs[0]->get_shape().as_dot(); +// r << "in_channels" << 
this->in_channels; +// r << "out_channels" << this->out_channels; +// r << "kernel_h" << this->kernel_h; +// r << "kernel_w" << this->kernel_w; +// r << "padding_h" << this->padding_h; +// r << "padding_w" << this->padding_w; +// r << "stride_h" << this->stride_h; +// r << "stride_w" << this->stride_w; +// r << this->outputs[0]->get_shape().as_dot(); +// rr << r; + +// return rr; +// } + +// bool Conv2D::estimate_sync_cost(Simulator *sim, +// MachineView const &view, +// CostMetrics &cost_metrics) const { +// ParallelDim kernel_dims[MAX_TENSOR_DIM], bias_dims[MAX_TENSOR_DIM]; +// int kernel_ndims, bias_ndims; + +// this->get_params().solve_dims(this->inputs[0]->get_shape(), +// nullptr, +// nullptr, +// kernel_dims, +// &kernel_ndims, +// bias_dims, +// &bias_ndims); + +// cost_metrics.sync_time = +// sim->default_estimate_sync_cost(kernel_dims, kernel_ndims, view); + +// if (this->use_bias) { +// cost_metrics.sync_time += +// sim->default_estimate_sync_cost(bias_dims, bias_ndims, view); +// } + +// return true; +// } + +/* + region(I): input + region(I/O): input_grad (if trainableInputs[0]) + region(I): output + region(I/O): output_grad + region(I): filter + region(I/O): filter_grad + region(I/O): bias_grad (if use_bias) +*/ + +/* + regions[0](I): input + regions[1](O): output + regions[2](I): filter + regions[3](I): bias +*/ + +// void Conv2D::forward(FFModel const &ff) { +// this->execute_task(ff, CONV2D_FWD_TASK_ID, get_fwd_task_signature()); +// } + +// void Conv2D::backward(FFModel const &ff) { +// this->execute_task(ff, CONV2D_BWD_TASK_ID, get_bwd_task_signature()); +// } + +// TaskSpec Conv2D::get_tasks_spec() const { +// OpTasksSpec spec { +// CONV2D_INIT_TASK_ID, +// CONV2D_FWD_TASK_ID, +// CONV2D_BWD_TASK_ID +// }; +// auto &fwd = spec.get_fwd(); + +// fwd.add_input_slot(INPUT); +// fwd.add_param_slot(KERNEL); +// fwd.add_output_slot(OUTPUT); + +// auto input = spec.input_tensor(0); +// auto kernel = spec.param_tensor(0); +// auto bias = 
spec.param_tensor(1); +// auto output = spec.output_tensor(0); + +// fwd[INPUT] = input; +// fwd[KERNEL] = kernel; +// if (this->use_bias) { +// fwd[BIAS] = bias; +// } +// fwd[OUTPUT] = output; + +// return spec; +// } + +/* TaskSpec Conv2D::get_forward_task_spec() const { */ +/* TaskSpec spec = { CONV2D_FWD_TASK_ID, Pass::FWD }; */ + +/* auto input = spec.add_tensor(TensorRole::INPUT, 0); */ +/* auto kernel = spec.add_tensor(TensorRole::PARAM, 0); */ +/* auto bias = spec.add_tensor(TensorRole::BIAS, 1); */ +/* auto output = spec.add_tensor(TensorRole::OUTPUT, 0); */ + +/* spec.add_input(INPUT, input); */ +/* spec.add_input(KERNEL, kernel); */ + +/* if (this->use_bias) { */ +/* spec.add_input(BIAS, bias); */ +/* } */ + +/* spec.add_output(OUTPUT, output); */ + +/* return spec; */ +/* } */ + +/* TaskSpec Conv2D::get_backward_task_spec() const { */ +/* TaskSpec spec = { CONV2D_BWD_TASK_ID, Pass::BWD }; */ + +/* auto input = spec.add_tensor(TensorRole::INPUT, 0); */ +/* auto kernel = spec.add_tensor(TensorRole::PARAM, 0); */ +/* auto bias = spec.add_tensor(TensorRole::BIAS, 1); */ +/* auto output = spec.add_tensor(TensorRole::OUTPUT, 0); */ + +/* spec.add_input(INPUT, input); */ +/* spec.add_output(INPUT_GRAD, input.grad); */ +/* spec.add_input(KERNEL, kernel); */ +/* spec.add_output(KERNEL_GRAD, kernel.grad); */ + +/* if (this->use_bias) { */ +/* spec.add_input(BIAS, bias); */ +/* spec.add_output(BIAS_GRAD, bias.grad); */ +/* } */ + +/* spec.add_input(OUTPUT, output); */ +/* spec.add_input(OUTPUT_GRAD, output.grad); */ + +/* return spec; */ +/* } */ + +/* + regions[0]: input + regions[1]: output + regions[2](I): filter + regions[3](I): bias + regions[4](O): filter_grad + regions[5](O): input_grad +*/ + +// void Conv2D::init(FFModel const &ff) { +// this->execute_task(ff, CONV2D_INIT_TASK_ID, get_init_task_signature()); +// assert(check_output_input_weight_same_parallel_is()); +// parallel_is = outputs[0]->parallel_is; +// ArgumentMap argmap; +// Context ctx = 
ff.config.lg_ctx; +// Runtime *runtime = ff.config.lg_hlr; +// set_argumentmap_for_init(ff, argmap); +// IndexLauncher launcher(CONV2D_INIT_TASK_ID, +// parallel_is, +// TaskArgument(this, sizeof(Conv2D)), +// argmap, +// Predicate::TRUE_PRED, +// false /*must*/, +// 0 /*mapper_id*/, +// outputs[0]->machine_view.hash()); +// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// inputs[0]->region)); +// launcher.add_field(0, FID_DATA); +// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// outputs[0]->region)); +// launcher.add_field(1, FID_DATA); +// launcher.add_region_requirement(RegionRequirement(weights[0]->part, +// 0 /*projection id*/, +// READ_ONLY, +// EXCLUSIVE, +// weights[0]->region)); +// launcher.add_field(2, FID_DATA); +// // launcher.add_region_requirement( +// // RegionRequirement(weights[1]->part, 0/*projection id*/, +// // READ_ONLY, EXCLUSIVE, weights[1]->region)); +// // launcher.add_field(3, FID_DATA); +// launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, +// 0 /*projection id*/, +// WRITE_ONLY, +// EXCLUSIVE, +// weights[0]->region_grad)); +// launcher.add_field(3, FID_DATA); +// // launcher.add_region_requirement( +// // RegionRequirement(inputs[0]->part_grad, 0/*projection id*/, +// // WRITE_ONLY, EXCLUSIVE, inputs[0]->region_grad)); +// // launcher.add_field(4, FID_DATA); +// FutureMap fm = runtime->execute_index_space(ctx, launcher); +// fm.wait_all_results(); +// set_opmeta_from_futuremap(ff, fm); +// } + +// printf("init conv (input): n(%d) c(%d) h(%d) w(%d)\n", +// input_n, +// input_c, +// input_h, +// input_w); +// printf("init conv (output): n(%d) c(%d) h(%d) w(%d)\n", +// output_n, +// output_c, +// output_h, +// output_w); + +// printf("convDim: padding(%d %d) stride(%d %d)\n", conv->padding_h, +// conv->padding_w, conv->stride_h, conv->stride_w); +// int pad_h = 
+// ((output_h - 1) * attrs.stride_h + attrs.kernel_h - input_h + 1) / 2; +// int pad_w = +// ((output_w - 1) * attrs.stride_w + attrs.kernel_w - input_w + 1) / 2; +// if (pad_h != attrs.padding_h) { +// printf("Warning: changing conv_padding_h to satisfy output_h size\n"); +// } +// if (pad_w != attrs.padding_w) { +// printf("Warning: changing conv_padding_w to satisfy output_w size\n"); +// } + +// size_t rid = 0; +// TensorAccessorR acc_input( +// regions[rid], task->regions[rid], FID_DATA, ctx, runtime); +// rid++; +// float *acc_input_grad_ptr = NULL; +// if (m->trainableInputs[0]) { +// TensorAccessorW acc_input_grad( +// regions[rid], +// task->regions[rid], +// FID_DATA, +// ctx, +// runtime, +// true /*readOutput*/); +// acc_input_grad_ptr = acc_input_grad.ptr; +// rid++; +// } +// TensorAccessorR acc_output( +// regions[rid], task->regions[rid], FID_DATA, ctx, runtime); +// rid++; +// TensorAccessorW acc_output_grad( +// regions[rid], +// task->regions[rid], +// FID_DATA, +// ctx, +// runtime, +// true /*readOutput*/); +// rid++; +// TensorAccessorR acc_kernel( +// regions[rid], task->regions[rid], FID_DATA, ctx, runtime); +// rid++; +// TensorAccessorW acc_kernel_grad( +// regions[rid], +// task->regions[rid], +// FID_DATA, +// ctx, +// runtime, +// true /*readOutput*/); +// rid++; +// float *acc_bias_grad_ptr = NULL; +// if (m->use_bias) { +// TensorAccessorW acc_bias_grad( +// regions[rid], +// task->regions[rid], +// FID_DATA, +// ctx, +// runtime, +// true /*readOutput*/); +// acc_bias_grad_ptr = static_cast(acc_bias_grad.ptr); +// rid++; +// } \ No newline at end of file From def4807787af0427ecf4cfc893015dabf9e80b30 Mon Sep 17 00:00:00 2001 From: Kate Unger Date: Sun, 10 Sep 2023 11:44:02 -0700 Subject: [PATCH 2/5] conv 2d cuda --- lib/kernels/include/kernels/accessor.h | 4 +- lib/kernels/include/kernels/conv_2d_kernels.h | 41 +- lib/kernels/include/kernels/device.h | 4 +- lib/kernels/include/kernels/ff_handle.h | 7 + 
lib/kernels/src/cuda/conv_2d_kernels.cu | 419 ++++----- lib/kernels/src/device.h | 6 +- lib/kernels/src/hip/conv_2d_kernels.cpp | 312 +++---- lib/runtime/src/ops/conv_2d.cc | 45 +- lib/runtime/src/ops/conv_2d.h | 796 ------------------ 9 files changed, 442 insertions(+), 1192 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 15369b2e13..32600fca8f 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -34,7 +34,7 @@ class GenericTensorAccessorW { ArrayShape shape; req ptr; }; -FF_VISITABLE_STRUCT(GenericTensorAccessorW, data_type, shape, ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, data_type, shape, ptr); class GenericTensorAccessorR { public: @@ -59,7 +59,7 @@ class GenericTensorAccessorR { ArrayShape shape; req ptr; }; -FF_VISITABLE_STRUCT(GenericTensorAccessorR, data_type, shape, ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, data_type, shape, ptr); int32_t *get_int32_ptr(GenericTensorAccessorW const &); int64_t *get_int64_ptr(GenericTensorAccessorW const &); diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index b327a2d2cd..a133f9e151 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -2,11 +2,18 @@ #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "op-attrs/activation.h" +#include "utils/visitable.h" +#include "kernels/accessor.h" namespace FlexFlow { struct Conv2DPerDeviceState { PerDeviceFFHandle handle; + optional activation; + bool use_bias; + ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t biasTensor; ffTensorDescriptor_t outputTensor; @@ -16,12 +23,12 @@ struct Conv2DPerDeviceState { ffConvolutionFwdAlgo_t fwdAlgo; ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; ffConvolutionBwdDataAlgo_t bwdDataAlgo; 
- req> activation; - req use_bias; }; -FF_VISITABLE_STRUCT_NO_EQ(Conv2DPerDeviceState, +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, handle, + activation, + use_bias, inputTensor, biasTensor, outputTensor, @@ -30,25 +37,25 @@ FF_VISITABLE_STRUCT_NO_EQ(Conv2DPerDeviceState, convDesc, fwdAlgo, bwdFilterAlgo, - bwdDataAlgo, - activation, - use_bias); + bwdDataAlgo); namespace Kernels { namespace Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - ffTensorDescriptor_t inputTensor, - ffTensorDescriptor_t biasTensor, - ffTensorDescriptor_t outputTensor, - ffFilterDescriptor_t filterDesc, - ffActivationDescriptor_t actiDesc, - ffConvolutionDescriptor_t convDesc, - ffConvolutionFwdAlgo_t fwdAlgo, - ffConvolutionBwdFilterAlgo_t bwdFilterAlgo, - ffConvolutionBwdDataAlgo_t bwdDataAlgo, - req> relu, - bool use_bias); + optional activation, + bool use_bias, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr); void forward_kernel(ffStream_t stream, Conv2DPerDeviceState const *m, diff --git a/lib/kernels/include/kernels/device.h b/lib/kernels/include/kernels/device.h index 663f5c2b3f..b1571da1b3 100644 --- a/lib/kernels/include/kernels/device.h +++ b/lib/kernels/include/kernels/device.h @@ -25,8 +25,6 @@ #include #include -namespace FlexFlow { - #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) typedef cudaStream_t ffStream_t; cudaError_t get_legion_stream(cudaStream_t *stream); @@ -79,6 +77,8 @@ typedef hipError_t ffError_t; #error "Unknown device" #endif +namespace FlexFlow { + #define FatalError(s) \ do { \ std::stringstream _where, _message; \ diff --git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 9df2f376e2..df99d96f51 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ 
b/lib/kernels/include/kernels/ff_handle.h @@ -6,6 +6,7 @@ #endif #include "kernels/device.h" +#include "utils/visitable.h" namespace FlexFlow { @@ -22,6 +23,12 @@ struct PerDeviceFFHandle { #endif }; +#ifdef FF_USE_NCCL +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, dnn, blas, workSpace, workSpaceSize, allowTensorOpMathConversion, ncclComm); +#else +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, dnn, blas, workSpace, workSpaceSize, allowTensorOpMathConversion); +#endif + } // namespace FlexFlow #endif diff --git a/lib/kernels/src/cuda/conv_2d_kernels.cu b/lib/kernels/src/cuda/conv_2d_kernels.cu index 265954aa17..aa4365c096 100644 --- a/lib/kernels/src/cuda/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/conv_2d_kernels.cu @@ -1,44 +1,172 @@ #include "kernels/conv_2d_kernels.h" -#include "kernels/cuda_helper.h" +#include "kernels/device.h" +#include "device.h" namespace FlexFlow { +namespace Kernels { +namespace Conv2D { + + +cudnnConvolutionBwdDataAlgo_t selectConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + void const *w, + const cudnnTensorDescriptor_t dyDesc, + void const *dy, + const cudnnConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + cudnnConvolutionBwdDataAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(cudnnFindConvolutionBackwardDataAlgorithmEx(handle, + wDesc, + w, + dyDesc, + dy, + convDesc, + dxDesc, + dx, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize)); + assert(cnt > 0); + checkCUDNN(perfResults[0].status); + printf("bwdDataAlgo(%d) time(%.2lf)\n", + perfResults[0].algo, + perfResults[0].time); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].algo; +} + +cudnnConvolutionFwdAlgo_t selectConvolutionForwardAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + 
void const *x, + const cudnnFilterDescriptor_t wDesc, + void const *w, + const cudnnConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const cudnnTensorDescriptor_t yDesc, + void *y, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + cudnnConvolutionFwdAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(handle, + xDesc, + x, + wDesc, + w, + convDesc, + yDesc, + y, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize)); + assert(cnt > 0); + checkCUDNN(perfResults[0].status); + printf("forwardAlgo(%d) time(%.2lf)\n", + perfResults[0].algo, + perfResults[0].time); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].algo; +} + +cudnnConvolutionBwdFilterAlgo_t selectConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + void const *x, + const cudnnTensorDescriptor_t dyDesc, + void const *dy, + const cudnnConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + cudnnConvolutionBwdFilterAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(cudnnFindConvolutionBackwardFilterAlgorithmEx(handle, + xDesc, + x, + dyDesc, + dy, + convDesc, + dwDesc, + dw, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize)); + assert(cnt > 0); + checkCUDNN(perfResults[0].status); + printf("bwdFilterAlgo(%d) time(%.2lf)\n", + perfResults[0].algo, + perfResults[0].time); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].algo; +} + +Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, + optional activation, + bool use_bias, + int kernel_h, + int kernel_w, + int groups, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { + 
+ ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t biasTensor; + ffTensorDescriptor_t outputTensor; + ffFilterDescriptor_t filterDesc; + ffActivationDescriptor_t actiDesc; + ffConvolutionDescriptor_t convDesc; + ffConvolutionFwdAlgo_t fwdAlgo; + ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; + ffConvolutionBwdDataAlgo_t bwdDataAlgo; + + int input_w = input.shape[legion_dim_t(0)]; + int input_h = input.shape[legion_dim_t(1)]; + int input_c = input.shape[legion_dim_t(2)]; + int input_n = input.shape[legion_dim_t(3)]; + + int output_w = output.shape[legion_dim_t(0)]; + int output_h = output.shape[legion_dim_t(1)]; + int output_c = output.shape[legion_dim_t(2)]; + int output_n = output.shape[legion_dim_t(3)]; -Conv2DPerDeviceState::Conv2DPerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateFilterDescriptor(&filterDesc)); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); -} -namespace Kernels { -namespace Conv2D { - -void init_kernel(Conv2DPerDeviceState *m, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int kernel_h, - int kernel_w, - int groups, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - float const *input_ptr, - float *output_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *forward_time, - float *backward_time) { - checkCUDNN(cudnnSetTensor4dDescriptor(m->inputTensor, + checkCUDNN(cudnnSetTensor4dDescriptor(inputTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, input_n, @@ -47,7 +175,7 @@ void init_kernel(Conv2DPerDeviceState *m, input_w)); checkCUDNN(cudnnSetTensor4dDescriptor( - m->biasTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_c, 1, 1)); + biasTensor, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_FLOAT, 1, output_c, 1, 1)); // Require that input_c is divisible by conv->groups assert(input_c % groups == 0); @@ -56,7 +184,7 @@ void init_kernel(Conv2DPerDeviceState *m, kernel_w, input_c / groups, output_c); - checkCUDNN(cudnnSetFilter4dDescriptor(m->filterDesc, + checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, output_c, @@ -64,7 +192,7 @@ void init_kernel(Conv2DPerDeviceState *m, kernel_h, kernel_w)); - checkCUDNN(cudnnSetConvolution2dDescriptor(m->convDesc, + checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, pad_h, // conv->padding_h, pad_w, // conv->padding_w, stride_h, @@ -74,83 +202,87 @@ void init_kernel(Conv2DPerDeviceState *m, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); if (groups != 1) { - checkCUDNN(cudnnSetConvolutionGroupCount(m->convDesc, groups)); + checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, groups)); } // enable tensor core when possible - if (m->handle.allowTensorOpMathConversion) { + if (handle.allowTensorOpMathConversion) { checkCUDNN(cudnnSetConvolutionMathType( - m->convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); + convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); } else { - checkCUDNN(cudnnSetConvolutionMathType(m->convDesc, CUDNN_TENSOR_OP_MATH)); + checkCUDNN(cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH)); } int n, c, h, w; checkCUDNN(cudnnGetConvolution2dForwardOutputDim( - m->convDesc, m->inputTensor, m->filterDesc, &n, &c, &h, &w)); + convDesc, inputTensor, filterDesc, &n, &c, &h, &w)); assert(n == output_n); assert(c == output_c); assert(h == output_h); assert(w == output_w); checkCUDNN(cudnnSetTensor4dDescriptor( - m->outputTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w)); + outputTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w)); float time; // select forward algorithm - m->fwdAlgo = selectConvolutionForwardAlgorithm(m->handle.dnn, - m->inputTensor, - input_ptr, - m->filterDesc, - kernel_ptr, - m->convDesc, - m->handle.workSpace, - 
m->handle.workSpaceSize, - m->outputTensor, - output_ptr, + fwdAlgo = selectConvolutionForwardAlgorithm(handle.dnn, + inputTensor, + input.get_float_ptr(), + filterDesc, + filter_ptr, + convDesc, + handle.workSpace, + handle.workSpaceSize, + outputTensor, + output.get_float_ptr(), &time); - if (forward_time != nullptr) { - *forward_time += time; - } // select backward filter algorithm - m->bwdFilterAlgo = - selectConvolutionBackwardFilterAlgorithm(m->handle.dnn, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->convDesc, - m->handle.workSpace, - m->handle.workSpaceSize, - m->filterDesc, - kernel_grad_ptr, + bwdFilterAlgo = + selectConvolutionBackwardFilterAlgorithm(handle.dnn, + inputTensor, + input.get_float_ptr(), + outputTensor, + output.get_float_ptr(), + convDesc, + handle.workSpace, + handle.workSpaceSize, + filterDesc, + filter_grad_ptr, &time); - if (backward_time != nullptr) { - *backward_time += time; - } // select backward data algorithm - m->bwdDataAlgo = - selectConvolutionBackwardDataAlgorithm(m->handle.dnn, - m->filterDesc, - kernel_ptr, - m->outputTensor, - output_ptr, - m->convDesc, - m->handle.workSpace, - m->handle.workSpaceSize, - m->inputTensor, - (float *)input_ptr, + bwdDataAlgo = + selectConvolutionBackwardDataAlgorithm(handle.dnn, + filterDesc, + filter_ptr, + outputTensor, + output.get_float_ptr(), + convDesc, + handle.workSpace, + handle.workSpaceSize, + inputTensor, + input.get_float_ptr(), &time); - if (backward_time != nullptr) { - *backward_time += time; - } - - if (m->relu) { + if (activation.has_value()) { checkCUDNN(cudnnSetActivationDescriptor( - m->actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); + actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); } + + Conv2DPerDeviceState per_device_state = {handle, + activation, + use_bias, + inputTensor, + biasTensor, + outputTensor, + filterDesc, + actiDesc, + convDesc, + fwdAlgo, + bwdFilterAlgo, + bwdDataAlgo}; + return per_device_state; } void 
forward_kernel(cudaStream_t stream, @@ -186,7 +318,7 @@ void forward_kernel(cudaStream_t stream, m->outputTensor, output_ptr)); } - if (m->relu) { + if (m->activation) { checkCUDNN(cudnnActivationForward(m->handle.dnn, m->actiDesc, &alpha, @@ -211,7 +343,7 @@ void backward_kernel(cudaStream_t stream, float alpha = 1.0f; // float beta = 0.0f; - if (m->relu) { + if (m->activation) { cudnnDataType_t dataType; int n, c, h, w, nStride, cStride, hStride, wStride; checkCUDNN(cudnnGetTensor4dDescriptor(m->outputTensor, @@ -272,123 +404,6 @@ void backward_kernel(cudaStream_t stream, } } -cudnnConvolutionFwdAlgo_t selectConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - void const *x, - const cudnnFilterDescriptor_t wDesc, - void const *w, - const cudnnConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const cudnnTensorDescriptor_t yDesc, - void *y, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - cudnnConvolutionFwdAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(handle, - xDesc, - x, - wDesc, - w, - convDesc, - yDesc, - y, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize)); - assert(cnt > 0); - checkCUDNN(perfResults[0].status); - printf("forwardAlgo(%d) time(%.2lf)\n", - perfResults[0].algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].algo; -} - -cudnnConvolutionBwdDataAlgo_t selectConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - void const *w, - const cudnnTensorDescriptor_t dyDesc, - void const *dy, - const cudnnConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - cudnnConvolutionBwdDataAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(cudnnFindConvolutionBackwardDataAlgorithmEx(handle, - 
wDesc, - w, - dyDesc, - dy, - convDesc, - dxDesc, - dx, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize)); - assert(cnt > 0); - checkCUDNN(perfResults[0].status); - printf("bwdDataAlgo(%d) time(%.2lf)\n", - perfResults[0].algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].algo; -} - -cudnnConvolutionBwdFilterAlgo_t selectConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - void const *x, - const cudnnTensorDescriptor_t dyDesc, - void const *dy, - const cudnnConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - cudnnConvolutionBwdFilterAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(cudnnFindConvolutionBackwardFilterAlgorithmEx(handle, - xDesc, - x, - dyDesc, - dy, - convDesc, - dwDesc, - dw, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize)); - assert(cnt > 0); - checkCUDNN(perfResults[0].status); - printf("bwdFilterAlgo(%d) time(%.2lf)\n", - perfResults[0].algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].algo; -} - } // namespace Conv2D } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h index dd3226ce5a..5c4239a5cf 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/device.h @@ -3,7 +3,8 @@ #include "kernels/array_shape.h" #include "kernels/device.h" -#include "op-attrs/ffconst.h" +#include "op-attrs/datatype.h" +#include "op-attrs/op.h" #if defined(FF_USE_CUDA) #include @@ -23,6 +24,9 @@ #error "Unknown device" #endif +using ::FlexFlow::DataType; +using ::FlexFlow::OperatorType; + #define checkCUDNN(status) \ do { \ std::stringstream _error; \ diff --git a/lib/kernels/src/hip/conv_2d_kernels.cpp b/lib/kernels/src/hip/conv_2d_kernels.cpp index 
9c6a7f15ae..78a7e0f4d4 100644 --- a/lib/kernels/src/hip/conv_2d_kernels.cpp +++ b/lib/kernels/src/hip/conv_2d_kernels.cpp @@ -14,51 +14,179 @@ */ #include "kernels/conv_2d_kernels.h" -#include "kernels/hip_helper.h" -#include namespace FlexFlow { +namespace Kernels { +namespace Conv2D { + + +miopenConvFwdAlgorithm_t selectConvolutionForwardAlgorithm( + miopenHandle_t handle, + const miopenTensorDescriptor_t xDesc, + void const *x, + const miopenTensorDescriptor_t wDesc, + void const *w, + const miopenConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const miopenTensorDescriptor_t yDesc, + void *y, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + miopenConvAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(miopenFindConvolutionForwardAlgorithm(handle, + xDesc, + x, + wDesc, + w, + convDesc, + yDesc, + y, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize, + false)); + assert(cnt > 0); + // checkCUDNN(perfResults[0].status); + printf("forwardAlgo(%d) time(%.2lf)\n", + perfResults[0].fwd_algo, + perfResults[0].time); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].fwd_algo; +} + +miopenConvBwdWeightsAlgorithm_t selectConvolutionBackwardFilterAlgorithm( + miopenHandle_t handle, + const miopenTensorDescriptor_t xDesc, + void const *x, + const miopenTensorDescriptor_t dyDesc, + void const *dy, + const miopenConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const miopenTensorDescriptor_t dwDesc, + void *dw, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + miopenConvAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(miopenFindConvolutionBackwardWeightsAlgorithm(handle, + dyDesc, + dy, + xDesc, + x, + convDesc, + dwDesc, + dw, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize, + false)); + assert(cnt > 0); + // checkCUDNN(perfResults[0].status); + printf("bwdFilterAlgo(%d) time(%.2lf)\n", + perfResults[0].bwd_weights_algo, + 
perfResults[0].time); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].bwd_weights_algo; +} + +miopenConvBwdDataAlgorithm_t selectConvolutionBackwardDataAlgorithm( + miopenHandle_t handle, + const miopenTensorDescriptor_t wDesc, + void const *w, + const miopenTensorDescriptor_t dyDesc, + void const *dy, + const miopenConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const miopenTensorDescriptor_t dxDesc, + void *dx, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + miopenConvAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(miopenFindConvolutionBackwardDataAlgorithm(handle, + dyDesc, + dy, + wDesc, + w, + convDesc, + dxDesc, + dx, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize, + false)); + assert(cnt > 0); + // checkCUDNN(perfResults[0].status); + printf("bwdDataAlgo(%d) time(%.2lf)\n", + perfResults[0].bwd_data_algo, + perfResults[0].time); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].bwd_data_algo; +} + +Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, + optional activation, + bool use_bias, + int kernel_h, + int kernel_w, + int groups, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { + + miopenCreateTensorDescriptor inputTensor; + miopenCreateTensorDescriptor biasTensor; + miopenCreateTensorDescriptor outputTensor; + miopenCreateTensorDescriptor filterDesc; + miopenActivationDescriptor_t actiDesc; + miopenActivationDescriptor_t convDesc; + miopenConvFwdAlgorithm_t fwdAlgo; + miopenConvBwdWeightsAlgorithm_t bwdFilterAlgo; + miopenConvBwdDataAlgorithm_t bwdDataAlgo; + + int input_w = input.shape[legion_dim_t(0)]; + int input_h = input.shape[legion_dim_t(1)]; + int input_c = input.shape[legion_dim_t(2)]; + int input_n = input.shape[legion_dim_t(3)]; + + int output_w = 
output.shape[legion_dim_t(0)]; + int output_h = output.shape[legion_dim_t(1)]; + int output_c = output.shape[legion_dim_t(2)]; + int output_n = output.shape[legion_dim_t(3)]; -Conv2DPerDeviceState::Conv2DPerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&filterDesc)); checkCUDNN(miopenCreateConvolutionDescriptor(&convDesc)); checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); -} - -namespace Kernels { -namespace Conv2D { -void init_kernel(Conv2DPerDeviceState *m, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int kernel_h, - int kernel_w, - int groups, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - float const *input_ptr, - float *output_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *forward_time, - float *backward_time) { checkCUDNN(miopenSet4dTensorDescriptor( - m->inputTensor, miopenFloat, input_n, input_c, input_h, input_w)); + inputTensor, miopenFloat, input_n, input_c, input_h, input_w)); checkCUDNN(miopenSet4dTensorDescriptor( - m->biasTensor, miopenFloat, 1, output_c, 1, 1)); + biasTensor, miopenFloat, 1, output_c, 1, 1)); // Require that input_c is divisible by conv->groups assert(input_c % groups == 0); @@ -289,126 +417,6 @@ void backward_kernel(hipStream_t stream, } } -miopenConvFwdAlgorithm_t selectConvolutionForwardAlgorithm( - miopenHandle_t handle, - const miopenTensorDescriptor_t xDesc, - void const *x, - const miopenTensorDescriptor_t wDesc, - void const *w, - const miopenConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const miopenTensorDescriptor_t yDesc, - void *y, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - miopenConvAlgoPerf_t perfResults[reqAlgCnt]; - 
checkCUDNN(miopenFindConvolutionForwardAlgorithm(handle, - xDesc, - x, - wDesc, - w, - convDesc, - yDesc, - y, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize, - false)); - assert(cnt > 0); - // checkCUDNN(perfResults[0].status); - printf("forwardAlgo(%d) time(%.2lf)\n", - perfResults[0].fwd_algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].fwd_algo; -} - -miopenConvBwdWeightsAlgorithm_t selectConvolutionBackwardFilterAlgorithm( - miopenHandle_t handle, - const miopenTensorDescriptor_t xDesc, - void const *x, - const miopenTensorDescriptor_t dyDesc, - void const *dy, - const miopenConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const miopenTensorDescriptor_t dwDesc, - void *dw, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - miopenConvAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(miopenFindConvolutionBackwardWeightsAlgorithm(handle, - dyDesc, - dy, - xDesc, - x, - convDesc, - dwDesc, - dw, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize, - false)); - assert(cnt > 0); - // checkCUDNN(perfResults[0].status); - printf("bwdFilterAlgo(%d) time(%.2lf)\n", - perfResults[0].bwd_weights_algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].bwd_weights_algo; -} - -miopenConvBwdDataAlgorithm_t selectConvolutionBackwardDataAlgorithm( - miopenHandle_t handle, - const miopenTensorDescriptor_t wDesc, - void const *w, - const miopenTensorDescriptor_t dyDesc, - void const *dy, - const miopenConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const miopenTensorDescriptor_t dxDesc, - void *dx, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - miopenConvAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(miopenFindConvolutionBackwardDataAlgorithm(handle, - dyDesc, - dy, - wDesc, - w, - convDesc, - dxDesc, - dx, - reqAlgCnt, - &cnt, - perfResults, - 
workSpace, - workSpaceSize, - false)); - assert(cnt > 0); - // checkCUDNN(perfResults[0].status); - printf("bwdDataAlgo(%d) time(%.2lf)\n", - perfResults[0].bwd_data_algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].bwd_data_algo; -} - } // namespace Conv2D } // namespace Kernels } // namespace FlexFlow diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/runtime/src/ops/conv_2d.cc index 0679fff96b..88f82944f2 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/runtime/src/ops/conv_2d.cc @@ -28,6 +28,9 @@ enum Slots { OpTaskInvocation init(Conv2DAttrs const &attrs) { OpTaskBinding binding; + binding.bind(INPUT, input_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); + binding.bind(FILTER, weight_tensor(0)); binding.bind_arg(ATTRS, attrs); binding.bind_arg(HANDLE, ff_handle()); @@ -61,31 +64,27 @@ static DeviceSpecific PerDeviceFFHandle handle = acc.get_argument(HANDLE); auto const &attrs = acc.get_argument(ATTRS); - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t biasTensor; - ffTensorDescriptor_t outputTensor; - ffFilterDescriptor_t filterDesc; - ffActivationDescriptor_t actiDesc; - ffConvolutionDescriptor_t convDesc; - ffConvolutionFwdAlgo_t fwdAlgo; - ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; - ffConvolutionBwdDataAlgo_t bwdDataAlgo; + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto filter = acc.get_tensor(FILTER); + auto filter_grad = acc.get_tensor_grad(FILTER); DeviceSpecific per_device_state = acc.create_device_specific( init_kernel(handle, - inputTensor, - biasTensor, - outputTensor, - filterDesc, - actiDesc, - convDesc, - fwdAlgo, - bwdFilterAlgo, - bwdDataAlgo, attrs.activation, - attrs.use_bias)); - + attrs.use_bias, + attrs.kernel_h, + attrs.kernel_w, + attrs.groups, + attrs.padding_h, + attrs.padding_w, + attrs.stride_h, + attrs.stride_w, + input, + output, + filter.get_float_ptr(), + filter_grad.get_float_ptr())); return per_device_state; } 
@@ -174,6 +173,9 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, ParallelTensorShape output_shape = get_output_shape(attrs, input_shape.shape); SimTaskBinding init_binding; + init_binding.bind(INPUT, input_shape); + init_binding.bind(OUTPUT, output_shape); + init_binding.bind(FILTER, filter_shape); init_binding.bind_arg(ATTRS, attrs); init_binding.bind_arg(HANDLE, ff_handle()); @@ -206,6 +208,9 @@ template <> void register_task() { OpTaskSignature init(OpTaskType::INIT); + init.add_input_slot(INPUT); + init.add_output_slot(OUTPUT); + init.add_weight_slot(FILTER); init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); diff --git a/lib/runtime/src/ops/conv_2d.h b/lib/runtime/src/ops/conv_2d.h index eefae82c4c..7225099a47 100644 --- a/lib/runtime/src/ops/conv_2d.h +++ b/lib/runtime/src/ops/conv_2d.h @@ -24,802 +24,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ProfilingSettings const &settings, MachineView const &machine_view); -/* namespace Conv2DInput { */ -/* static constexpr int INDEX = 0; */ - -/* enum { WIDTH = 0, HEIGHT = 1, CHANNEL = 2, SAMPLE = 3, REPLICA = 4, NUMDIM }; - */ -/* } */ - -/* namespace Conv2DOutput { */ -/* enum { WIDTH = 0, HEIGHT = 1, CHANNEL = 2, SAMPLE = 3, REPLICA = 4, NUMDIM }; - */ -/* } */ - -/* namespace Conv2DKernel { */ -/* static constexpr int INDEX = 0; */ - -/* enum { */ -/* WIDTH = 0, */ -/* HEIGHT = 1, */ -/* CHANNEL_IN = 2, */ -/* CHANNEL_OUT = 3, */ -/* REPLICA = 4, */ -/* NUMDIM */ -/* }; */ -/* } */ - -/* /1* namespace Conv2DBias { *1/ */ -/* /1* static constexpr int INDEX = 1; *1/ */ - -/* class Conv2D : public Op { */ -/* public: */ -/* Conv2D(FFModel &model, */ -/* LayerID const &layer_guid, */ -/* const ParallelTensor input, */ -/* int outChannels, */ -/* int kernelH, */ -/* int kernelW, */ -/* int strideH, */ -/* int strideW, */ -/* int paddingH, */ -/* int paddingW, */ -/* ActiMode activation, */ -/* int groups, */ -/* bool use_bias, */ -/* bool allocate_weights, */ -/* 
char const *name); */ -/* Conv2D(FFModel &model, */ -/* Conv2D const &other, */ -/* const ParallelTensor input, */ -/* bool allocate_weights); */ -/* Conv2D(FFModel &model, */ -/* Conv2DAttrs const &attrs, */ -/* std::vector const &inputs, */ -/* char const *name = nullptr, */ -/* bool allocate_weights = false); */ -/* void init(FFModel const &) override; */ -/* void forward(FFModel const &) override; */ -/* void backward(FFModel const &) override; */ -/* // void update(const FFModel&); */ -/* // Parameter* get_parameter(int index); */ -/* // void create_weights(FFModel& model); */ -/* // void create_input_partition(FFModel& model); */ -/* static Op * */ -/* create_operator_from_layer(FFModel &model, */ -/* Layer const *layer, */ -/* std::vector const &inputs); - */ - -/* static PerDeviceOpState *init_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void forward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void backward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* bool measure_operator_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const override; */ -/* bool estimate_sync_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const override; */ - -/* /1* static void *1/ */ -/* /1* construct_output_mappings(std::vector &); - * *1/ */ -/* /1* static void construct_mappings(std::vector &, - * *1/ */ -/* /1* bool use_bias); *1/ */ -/* /1* static void - * construct_weight_mappings(std::vector &, *1/ */ -/* /1* bool use_bias); *1/ */ - -/* public: */ -/* int in_channels, out_channels, kernel_h, kernel_w, stride_h, stride_w, */ -/* padding_h, padding_w; */ -/* ActiMode activation; */ -/* int groups; */ -/* bool use_bias; */ 
-/* }; */ - } // namespace FlexFlow #endif - -// Tensor FFModel::conv2d(Tensor const &input, -// int outChannels, -// int kernelH, -// int kernelW, -// int strideH, -// int strideW, -// int paddingH, -// int paddingW, -// ActiMode activation, -// int groups, -// bool use_bias, -// Layer const *shared_op, -// Initializer *kernel_initializer, -// Initializer *bias_initializer, -// char const *name) { -// assert(input->num_dims() == 4); /*NCHW*/ - -// Conv2DAttrs attrs = {outChannels, -// kernelH, -// kernelW, -// strideH, -// strideW, -// paddingH, -// paddingW, -// groups, -// activation, -// use_bias}; - -// TensorShape output_shape = get_output_shape(attrs, input->get_shape()); -// Tensor output = this->tensor_mgr.create(output_shape, CreateGrad::YES, -// conv); - -// std::vector weights; - -// TensorShape kernel_shape = get_kernel_shape(attrs, input->get_shape()); -// weights.push_back(this->tensor_mgr.create( -// kernel_shape, CreateGrad::YES, kernel_initializer, CHOSEN_SYNC_TYPE)); - -// if (use_bias) { -// TensorShape bias_shape = get_bias_shape(attrs, input->get_shape()); -// weights.push_back(this->tensor_mgr.create( -// bias_shape, CreateGrad::YES, bias_initializer, CHOSEN_SYNC_TYPE)); -// } - -// Layer *conv = -// this->layer_mgr.create(attrs, DT_FLOAT, name, {input}, weights, -// {output}); - -// //{ -// // int numdims = 4; -// // int dims[MAX_TENSOR_DIM]; -// // dims[3] = input->dims[3]; -// // dims[2] = outChannels; -// // dims[1] = 1 + (input->dims[1] + 2 * paddingH - kernelH) / strideH; -// // dims[0] = 1 + (input->dims[0] + 2 * paddingW - kernelW) / strideW; -// // conv->outputs[0] = create_tensor_legion_ordering( -// // numdims, dims, DT_FLOAT, conv, 0, true /*create_grad*/); -// //} -// //{ -// // int dims[4] = {kernelW, kernelH, input->dims[2], outChannels}; -// // conv->weights[0] = create_weight_legion_ordering(4, -// // dims, -// // DT_FLOAT, -// // conv, -// // true /*create_grad*/, -// // kernel_initializer, -// // CHOSEN_SYNC_TYPE); -// //} 
-// // if (use_bias) { -// // int dims[1] = {outChannels}; -// // conv->weights[1] = create_weight_legion_ordering(1, -// // dims, -// // DT_FLOAT, -// // conv, -// // true /*create_grad*/, -// // bias_initializer, -// // CHOSEN_SYNC_TYPE); -// //} -// conv->add_initializer("kernel", kernel_initializer); -// conv->add_initializer("bias", bias_initializer); -// /* layers.push_back(conv); */ -// return conv->outputs[0]; -// } - -// Op *Conv2D::create_operator_from_layer( -// FFModel &model, -// Layer const *layer, -// std::vector const &inputs) { -// return new Conv2D(model, -// get(layer->attrs), -// inputs, -// layer->name, -// false /*allocate_weights*/ -// ); -// } - -/* void Conv2DParams::mark_replica_dims( */ -/* ParallelTensorShape const &input, */ -/* ParallelDim output_dims[MAX_TENSOR_DIM], */ -/* ParallelDim kernel_dims[MAX_TENSOR_DIM], */ -/* ParallelDim bias_dims[MAX_TENSOR_DIM]) const { */ -/* if (output_dims != nullptr) { */ -/* output_dims[Conv2DOutput::REPLICA].is_replica_dim = true; */ -/* } */ -/* if (kernel_dims != nullptr) { */ -/* kernel_dims[Conv2DOutput::REPLICA].is_replica_dim = true; */ -/* } */ -/* if (bias_dims != nullptr) { */ -/* bias_dims[Conv2DBias::REPLICA_1].is_replica_dim = true; */ -/* bias_dims[Conv2DBias::REPLICA_2].is_replica_dim = true; */ -/* bias_dims[Conv2DBias::REPLICA_3].is_replica_dim = true; */ -/* bias_dims[Conv2DBias::REPLICA_4].is_replica_dim = true; */ -/* } */ -/* } */ - -/* int Conv2DParams::output_size(ParallelTensorShape const &input, */ -/* ParallelDim output_dims[MAX_TENSOR_DIM]) const - * { */ -/* int input_w = input.dims[Conv2DInput::WIDTH].size; */ -/* int input_h = input.dims[Conv2DInput::HEIGHT].size; */ - -/* output_dims[Conv2DOutput::SAMPLE].size = - * input.dims[Conv2DInput::SAMPLE].size; */ -/* output_dims[Conv2DOutput::CHANNEL].size = out_channels; */ -/* output_dims[Conv2DOutput::HEIGHT].size = */ -/* 1 + (input_h + 2 * padding_h - kernel_h) / stride_h; */ -/* output_dims[Conv2DOutput::WIDTH].size = 
*/ -/* 1 + (input_w + 2 * padding_w - kernel_w) / stride_w; */ - -/* return input.num_dims; */ -/* }; */ - -/* int Conv2DParams::kernel_size(ParallelTensorShape const &input, */ -/* ParallelDim kernel_dims[MAX_TENSOR_DIM]) const - * { */ -/* kernel_dims[Conv2DKernel::CHANNEL_OUT].size = this->out_channels; */ -/* kernel_dims[Conv2DKernel::CHANNEL_IN].size = */ -/* input.dims[Conv2DInput::CHANNEL].size / this->groups; */ -/* kernel_dims[Conv2DKernel::HEIGHT].size = */ -/* this->kernel_h * input.dims[Conv2DInput::HEIGHT].degree; */ -/* kernel_dims[Conv2DKernel::WIDTH].size = */ -/* this->kernel_w * input.dims[Conv2DInput::WIDTH].degree; */ - -/* return Conv2DKernel::NUMDIM; */ -/* } */ - -/* int Conv2DParams::bias_size(ParallelTensorShape const &input, */ -/* ParallelDim bias_dims[MAX_TENSOR_DIM]) const { */ -/* bias_dims[Conv2DBias::CHANNEL].size = this->out_channels; */ - -/* return Conv2DBias::NUMDIM; */ -/* }; */ - -/* void Conv2DParams::solve_dims(ParallelTensorShape const &input, */ -/* ParallelDim output_dims[MAX_TENSOR_DIM], */ -/* int *output_ndims, */ -/* ParallelDim kernel_dims[MAX_TENSOR_DIM], */ -/* int *kernel_ndims, */ -/* ParallelDim bias_dims[MAX_TENSOR_DIM], */ -/* int *bias_ndims) const { */ -/* assert((output_dims == nullptr) == (output_ndims == nullptr)); */ -/* assert((kernel_dims == nullptr) == (kernel_ndims == nullptr)); */ -/* assert((bias_dims == nullptr) == (bias_ndims == nullptr)); */ - -/* std::vector mapping; */ -/* Conv2D::construct_mappings(mapping, this->use_bias); */ - -/* this->mark_replica_dims(input, output_dims, kernel_dims, bias_dims); */ - -/* std::vector output_dim_sets; */ -/* if (output_dims != nullptr) { */ -/* output_dim_sets.push_back(output_dims); */ -/* } */ - -/* std::vector weight_dim_sets; */ -/* if (kernel_dims != nullptr) { */ -/* weight_dim_sets.push_back(kernel_dims); */ -/* } */ -/* if (bias_dims != nullptr && this->use_bias) { */ -/* weight_dim_sets.push_back(bias_dims); */ -/* } */ - -/* 
solve_parallel_dim_mappings( */ -/* mapping, {input.dims}, weight_dim_sets, output_dim_sets); */ - -/* if (output_dims != nullptr) { */ -/* *output_ndims = this->output_size(input, output_dims); */ -/* } */ -/* if (kernel_dims != nullptr) { */ -/* *kernel_ndims = this->kernel_size(input, kernel_dims); */ -/* } */ -/* if (bias_dims != nullptr && this->use_bias) { */ -/* *bias_ndims = this->bias_size(input, bias_dims); */ -/* } */ -/* } */ - -/*static*/ -/* void Conv2D::construct_mappings(std::vector &out, - */ -/* bool use_bias) { */ -/* Conv2D::construct_output_mappings(out); */ -/* Conv2D::construct_weight_mappings(out, use_bias); */ -/* } */ - -/*static*/ -/* void Conv2D::construct_output_mappings( */ -/* std::vector &out) { */ -/* Op::construct_output_parallel_dims( */ -/* out, */ -/* {{Conv2DInput::CHANNEL, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DOutput::REPLICA}, */ -/* {Conv2DInput::SAMPLE, MappingOperation::PARTITION, - * Conv2DOutput::SAMPLE}, */ -/* {Conv2DInput::REPLICA, */ -/* MappingOperation::PARTITION, */ -/* Conv2DOutput::CHANNEL}, */ -/* {Conv2DInput::HEIGHT, MappingOperation::PARTITION, - * Conv2DOutput::HEIGHT}, */ -/* {Conv2DInput::WIDTH, MappingOperation::PARTITION, - * Conv2DOutput::WIDTH}}); */ -/* } */ - -/*static*/ -/* void Conv2D::construct_weight_mappings( */ -/* std::vector &out, bool use_bias) { */ -/* Op::construct_weight_parallel_dims( */ -/* out, */ -/* { */ -/* {Conv2DInput::REPLICA, */ -/* MappingOperation::PARTITION, */ -/* Conv2DKernel::CHANNEL_OUT}, */ -/* {Conv2DInput::SAMPLE, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DKernel::REPLICA}, */ -/* {Conv2DInput::CHANNEL, */ -/* MappingOperation::PARTITION, */ -/* Conv2DKernel::CHANNEL_IN}, */ -/* {Conv2DInput::HEIGHT, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DKernel::HEIGHT}, // Kernel::{HEIGHT, WEIGHT} would both work - */ -/* // here */ -/* {Conv2DInput::WIDTH, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DKernel::WIDTH}, // same as above */ -/* }, */ -/* 
Conv2DInput::INDEX, */ -/* Conv2DKernel::INDEX); */ - -/* if (use_bias) { */ -/* Op::construct_weight_parallel_dims( */ -/* out, */ -/* {{Conv2DInput::REPLICA, Conv2DBias::REPLICA_1}, */ -/* {Conv2DInput::SAMPLE, Conv2DBias::REPLICA_2}, */ -/* {Conv2DInput::CHANNEL, Conv2DBias::CHANNEL}, */ -/* {Conv2DInput::HEIGHT, Conv2DBias::REPLICA_3}, */ -/* {Conv2DInput::WIDTH, Conv2DBias::REPLICA_4}}, */ -/* Conv2DInput::INDEX, */ -/* Conv2DBias::INDEX); */ -/* } */ -/* } */ - -// Conv2D::Conv2D(FFModel &model, -// Conv2D const &other, -// const ParallelTensor input, -// bool allocate_weights) -// : Conv2D(model, -// other.layer_guid, -// input, -// other.out_channels, -// other.kernel_h, -// other.kernel_w, -// other.stride_h, -// other.stride_w, -// other.padding_h, -// other.padding_w, -// other.activation, -// other.groups, -// other.use_bias, -// allocate_weights, -// other.name) {} - -// Conv2D::Conv2D(FFModel &model, -// Conv2DAttrs const &attrs, -// std::vector const &inputs, -// char const *name, -// bool allocate_weights) -// : Conv2D(model, -// params.layer_guid, -// input, -// params.out_channels, -// params.kernel_h, -// params.kernel_w, -// params.stride_h, -// params.stride_w, -// params.padding_h, -// params.padding_w, -// params.activation, -// params.groups, -// params.use_bias, -// allocate_weights, -// name) {} - -/* bool Conv2DParams::is_valid(ParallelTensorShape const &input) const { */ -/* ParallelTensorShape output_shape, kernel_shape, bias_shape; */ -/* this->solve_dims(input, */ -/* output_shape.dims, */ -/* &output_shape.num_dims, */ -/* kernel_shape.dims, */ -/* &kernel_shape.num_dims, */ -/* bias_shape.dims, */ -/* &bias_shape.num_dims); */ -/* bool is_valid = true; */ -/* is_valid &= input.is_valid(); */ -/* is_valid &= output_shape.is_valid(); */ -/* is_valid &= kernel_shape.is_valid(); */ -/* if (use_bias) { */ -/* is_valid &= bias_shape.is_valid(); */ -/* } */ - -/* // TODO FIXME: Currently disable parallelizing the height and width - * 
dimension */ -/* if (input.dims[0].degree > 1 || input.dims[1].degree > 1) { */ -/* return false; */ -/* } */ - -/* return is_valid; */ -/* } */ - -// Conv2D::Conv2D(FFModel &model, -// LayerID const &_layer_guid, -// const ParallelTensor input, -// int outChannels, -// int kernelH, -// int kernelW, -// int strideH, -// int strideW, -// int paddingH, -// int paddingW, -// ActiMode activation, -// int groups, -// bool use_bias, -// bool allocate_weights, -// char const *name) -// : Op(model, -// OP_CONV2D, -// DT_FLOAT, -// name, -// 1 /*inputs*/, -// use_bias ? 2 : 1 /*weights*/, -// allocate_weights, -// 1 /*outputs*/, -// input), -// in_channels(input->dims[Conv2DInput::CHANNEL].size / -// input->dims[Conv2DInput::CHANNEL].degree), -// out_channels(outChannels), kernel_h(kernelH), kernel_w(kernelW), -// stride_h(strideH), stride_w(strideW), padding_h(paddingH), -// padding_w(paddingW), activation(activation), groups(groups), -// use_bias(use_bias) { -// // overwrite layer_guid -// layer_guid = _layer_guid; -// assert(input->num_dims == Conv2DInput::NUMDIM); -// assert(this->stride_h > 0); -// assert(this->stride_w > 0); - -// ParallelDim output_dims[MAX_TENSOR_DIM], kernel_dims[MAX_TENSOR_DIM], -// bias_dims[MAX_TENSOR_DIM]; -// int output_ndims, kernel_ndims, bias_ndims; - -// this->construct_mappings(*this->parallel_dims_mapping, this->use_bias); -// this->get_params().solve_dims(this->inputs[0]->get_shape(), -// output_dims, -// &output_ndims, -// kernel_dims, -// &kernel_ndims, -// bias_dims, -// &bias_ndims); - -// if (allocate_weights) { -// Initializer *kernel_initializer = new GlorotUniform(std::rand() -// /*seed*/); - -// weights[Conv2DKernel::INDEX] = -// model.create_parallel_weight_legion_ordering(kernel_ndims, -// kernel_dims, -// DT_FLOAT, -// NULL /*owner_op*/, -// true /*create_grad*/, -// kernel_initializer, -// CHOSEN_SYNC_TYPE); - -// if (use_bias) { -// Initializer *bias_initializer = new ZeroInitializer(); - -// weights[Conv2DBias::INDEX] = 
-// model.create_parallel_weight_legion_ordering(bias_ndims, -// bias_dims, -// DT_FLOAT, -// NULL /*owner_op*/, -// true /*create_grad*/, -// bias_initializer, -// CHOSEN_SYNC_TYPE); -// } -// } - -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// output_ndims, output_dims, DT_FLOAT, this); - -// assert(check_output_input_weight_parallel_dims(allocate_weights)); -// } - -// tl::optional Conv2D::as_dot() const { -// RecordFormatter rr; -// RecordFormatter r; - -// r << this->inputs[0]->get_shape().as_dot(); -// r << "in_channels" << this->in_channels; -// r << "out_channels" << this->out_channels; -// r << "kernel_h" << this->kernel_h; -// r << "kernel_w" << this->kernel_w; -// r << "padding_h" << this->padding_h; -// r << "padding_w" << this->padding_w; -// r << "stride_h" << this->stride_h; -// r << "stride_w" << this->stride_w; -// r << this->outputs[0]->get_shape().as_dot(); -// rr << r; - -// return rr; -// } - -// bool Conv2D::estimate_sync_cost(Simulator *sim, -// MachineView const &view, -// CostMetrics &cost_metrics) const { -// ParallelDim kernel_dims[MAX_TENSOR_DIM], bias_dims[MAX_TENSOR_DIM]; -// int kernel_ndims, bias_ndims; - -// this->get_params().solve_dims(this->inputs[0]->get_shape(), -// nullptr, -// nullptr, -// kernel_dims, -// &kernel_ndims, -// bias_dims, -// &bias_ndims); - -// cost_metrics.sync_time = -// sim->default_estimate_sync_cost(kernel_dims, kernel_ndims, view); - -// if (this->use_bias) { -// cost_metrics.sync_time += -// sim->default_estimate_sync_cost(bias_dims, bias_ndims, view); -// } - -// return true; -// } - -/* - region(I): input - region(I/O): input_grad (if trainableInputs[0]) - region(I): output - region(I/O): output_grad - region(I): filter - region(I/O): filter_grad - region(I/O): bias_grad (if use_bias) -*/ - -/* - regions[0](I): input - regions[1](O): output - regions[2](I): filter - regions[3](I): bias -*/ - -// void Conv2D::forward(FFModel const &ff) { -// this->execute_task(ff, CONV2D_FWD_TASK_ID, 
get_fwd_task_signature()); -// } - -// void Conv2D::backward(FFModel const &ff) { -// this->execute_task(ff, CONV2D_bWD_TASK_ID, get_bwd_task_signature()); -// } - -// TaskSpec Conv2D::get_tasks_spec() const { -// OpTasksSpec spec { -// CONV2D_INIT_TASK_ID, -// CONV2D_FWD_TASK_ID, -// CONV2D_BWD_TASK_ID -// }; -// auto &fwd = spec.get_fwd(); - -// fwd.add_input_slot(INPUT); -// fwd.add_param_slot(KERNEL); -// fwd.add_output_slot(OUTPUT); - -// auto input = spec.input_tensor(0); -// auto kernel = spec.param_tensor(0); -// auto bias = spec.param_tensor(1); -// auto output = spec.output_tensor(0); - -// fwd[INPUT] = input; -// fwd[KERNEL] = kernel; -// if (this->use_bias) { -// fwd[BIAS] = bias; -// } -// fwd[OUTPUT] = output; - -// return spec; -// } - -/* TaskSpec Conv2D::get_forward_task_spec() const { */ -/* TaskSpec spec = { CONV2D_FWD_TASK_ID, Pass::FWD }; */ - -/* auto input = spec.add_tensor(TensorRole::INPUT, 0); */ -/* auto kernel = spec.add_tensor(TensorRole::PARAM, 0); */ -/* auto bias = spec.add_tensor(TensorRole::BIAS, 1); */ -/* auto output = spec.add_tensor(TensorRole::OUTPUT, 0); */ - -/* spec.add_input(INPUT, input); */ -/* spec.add_input(KERNEL, kernel); */ - -/* if (this->use_bias) { */ -/* spec.add_input(BIAS, bias); */ -/* } */ - -/* spec.add_output(OUTPUT, output); */ - -/* return spec; */ -/* } */ - -/* TaskSpec Conv2D::get_backward_task_spec() const { */ -/* TaskSpec spec = { CONV2D_BWD_TASK_ID, Pass::BWD }; */ - -/* auto input = spec.add_tensor(TensorRole::INPUT, 0); */ -/* auto kernel = spec.add_tensor(TensorRole::PARAM, 0); */ -/* auto bias = spec.add_tensor(TensorRole::BIAS, 1); */ -/* auto output = spec.add_tensor(TensorRole::OUTPUT, 0); */ - -/* spec.add_input(INPUT, input); */ -/* spec.add_output(INPUT_GRAD, input.grad); */ -/* spec.add_input(KERNEL, kernel); */ -/* spec.add_output(KERNEL_GRAD, kernel.grad); */ - -/* if (this->use_bias) { */ -/* spec.add_input(BIAS, bias); */ -/* spec.add_output(BIAS_GRAD, bias.grad); */ -/* } */ - -/* 
spec.add_input(OUTPUT, output); */ -/* spec.add_input(OUTPUT_GRAD, output.grad); */ - -/* return spec; */ -/* } */ - -/* - regions[0]: input - regions[1]: output - regions[2](I): filter - regions[3](I): bias - regions[4](O): filter_grad - regions[5](O): input_grad -*/ - -// void Conv2D::init(FFModel const &ff) { -// this->execute_task(ff, CONV2D_INIT_TASK_ID, get_init_task_signature()); -// assert(check_output_input_weight_same_parallel_is()); -// parallel_is = outputs[0]->parallel_is; -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_init(ff, argmap); -// IndexLauncher launcher(CONV2D_INIT_TASK_ID, -// parallel_is, -// TaskArgument(this, sizeof(Conv2D)), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(1, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// weights[0]->region)); -// launcher.add_field(2, FID_DATA); -// // launcher.add_region_requirement( -// // RegionRequirement(weights[1]->part, 0/*projection id*/, -// // READ_ONLY, EXCLUSIVE, weights[1]->region)); -// // launcher.add_field(3, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// weights[0]->region_grad)); -// launcher.add_field(3, FID_DATA); -// // launcher.add_region_requirement( -// // RegionRequirement(inputs[0]->part_grad, 0/*projection id*/, -// // WRITE_ONLY, EXCLUSIVE, inputs[0]->region_grad)); -// // 
launcher.add_field(4, FID_DATA); -// FutureMap fm = runtime->execute_index_space(ctx, launcher); -// fm.wait_all_results(); -// set_opmeta_from_futuremap(ff, fm); -// } - -// printf("init conv (input): n(%d) c(%d) h(%d) w(%d)\n", -// input_n, -// input_c, -// input_h, -// input_w); -// printf("init conv (output): n(%d) c(%d) h(%d) w(%d)\n", -// output_n, -// output_c, -// output_h, -// output_w); - -// printf("convDim: padding(%d %d) stride(%d %d)\n", conv->padding_h, -// conv->padding_w, conv->stride_h, conv->stride_w); -// int pad_h = -// ((output_h - 1) * attrs.stride_h + attrs.kernel_h - input_h + 1) / 2; -// int pad_w = -// ((output_w - 1) * attrs.stride_w + attrs.kernel_w - input_w + 1) / 2; -// if (pad_h != attrs.padding_h) { -// printf("Warning: changing conv_padding_h to satisfy output_h size\n"); -// } -// if (pad_w != attrs.padding_w) { -// printf("Warning: changing conv_padding_w to satisfy output_w size\n"); -// } - -// size_t rid = 0; -// TensorAccessorR acc_input( -// regions[rid], task->regions[rid], FID_DATA, ctx, runtime); -// rid++; -// float *acc_input_grad_ptr = NULL; -// if (m->trainableInputs[0]) { -// TensorAccessorW acc_input_grad( -// regions[rid], -// task->regions[rid], -// FID_DATA, -// ctx, -// runtime, -// true /*readOutput*/); -// acc_input_grad_ptr = acc_input_grad.ptr; -// rid++; -// } -// TensorAccessorR acc_output( -// regions[rid], task->regions[rid], FID_DATA, ctx, runtime); -// rid++; -// TensorAccessorW acc_output_grad( -// regions[rid], -// task->regions[rid], -// FID_DATA, -// ctx, -// runtime, -// true /*readOutput*/); -// rid++; -// TensorAccessorR acc_kernel( -// regions[rid], task->regions[rid], FID_DATA, ctx, runtime); -// rid++; -// TensorAccessorW acc_kernel_grad( -// regions[rid], -// task->regions[rid], -// FID_DATA, -// ctx, -// runtime, -// true /*readOutput*/); -// rid++; -// float *acc_bias_grad_ptr = NULL; -// if (m->use_bias) { -// TensorAccessorW acc_bias_grad( -// regions[rid], -// task->regions[rid], -// 
FID_DATA, -// ctx, -// runtime, -// true /*readOutput*/); -// acc_bias_grad_ptr = static_cast(acc_bias_grad.ptr); -// rid++; -// } \ No newline at end of file From 5d15c079937e5d15fe86b5695bd3b39bd909f855 Mon Sep 17 00:00:00 2001 From: Kate Unger Date: Sun, 10 Sep 2023 11:52:02 -0700 Subject: [PATCH 3/5] pr changes --- lib/kernels/include/kernels/conv_2d_kernels.h | 4 ++-- lib/runtime/src/ops/conv_2d.cc | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index a133f9e151..a99fcc73e4 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -58,14 +58,14 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, float *filter_grad_ptr); void forward_kernel(ffStream_t stream, - Conv2DPerDeviceState const *m, + Conv2DPerDeviceState const &m, float const *input_ptr, float *output_ptr, float const *filter_ptr, float const *bias_ptr); void backward_kernel(ffStream_t stream, - Conv2DPerDeviceState const *m, + Conv2DPerDeviceState const &m, float const *input_ptr, float *input_grad_ptr, float const *output_ptr, diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/runtime/src/ops/conv_2d.cc index 88f82944f2..15bcaaba73 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/runtime/src/ops/conv_2d.cc @@ -62,8 +62,7 @@ static DeviceSpecific init_task_impl(TaskArgumentAccessor const &acc) { PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto const &attrs = acc.get_argument(ATTRS); - + auto attrs = acc.get_argument(ATTRS); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto filter = acc.get_tensor(FILTER); From 3aad5eb5802ba65173e5b6826c6089e629475f41 Mon Sep 17 00:00:00 2001 From: Kate Unger Date: Sun, 10 Sep 2023 14:59:30 -0700 Subject: [PATCH 4/5] format --- lib/kernels/include/kernels/accessor.h | 10 +- lib/kernels/include/kernels/conv_2d_kernels.h | 54 +++++------ 
lib/kernels/include/kernels/ff_handle.h | 15 ++- lib/kernels/src/cuda/conv_2d_kernels.cu | 94 +++++++++---------- lib/kernels/src/hip/conv_2d_kernels.cpp | 31 +++--- lib/runtime/src/ops/conv_2d.cc | 2 +- 6 files changed, 110 insertions(+), 96 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 32600fca8f..c65c2befb8 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -34,7 +34,10 @@ class GenericTensorAccessorW { ArrayShape shape; req ptr; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, data_type, shape, ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, + data_type, + shape, + ptr); class GenericTensorAccessorR { public: @@ -59,7 +62,10 @@ class GenericTensorAccessorR { ArrayShape shape; req ptr; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, data_type, shape, ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, + data_type, + shape, + ptr); int32_t *get_int32_ptr(GenericTensorAccessorW const &); int64_t *get_int64_ptr(GenericTensorAccessorW const &); diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index a99fcc73e4..28e669886a 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -1,11 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H +#include "kernels/accessor.h" #include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.h" #include "utils/visitable.h" -#include "kernels/accessor.h" namespace FlexFlow { @@ -26,36 +26,36 @@ struct Conv2DPerDeviceState { }; FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, - handle, - activation, - use_bias, - inputTensor, - biasTensor, - outputTensor, - filterDesc, - actiDesc, - convDesc, - fwdAlgo, - 
bwdFilterAlgo, - bwdDataAlgo); + handle, + activation, + use_bias, + inputTensor, + biasTensor, + outputTensor, + filterDesc, + actiDesc, + convDesc, + fwdAlgo, + bwdFilterAlgo, + bwdDataAlgo); namespace Kernels { namespace Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - optional activation, - bool use_bias, - int kernel_h, - int kernel_w, - int groups, - int padding_h, - int padding_w, - int stride_h, - int stride_w, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - float const *filter_ptr, - float *filter_grad_ptr); + optional activation, + bool use_bias, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr); void forward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, @@ -78,4 +78,4 @@ void backward_kernel(ffStream_t stream, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index df99d96f51..e15ee4ff32 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -24,9 +24,20 @@ struct PerDeviceFFHandle { }; #ifdef FF_USE_NCCL -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, dnn, blas, workSpace, workSpaceSize, allowTensorOpMathConversion, ncclComm); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, + dnn, + blas, + workSpace, + workSpaceSize, + allowTensorOpMathConversion, + ncclComm); #else -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, dnn, blas, workSpace, workSpaceSize, allowTensorOpMathConversion); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, + dnn, + blas, + workSpace, + 
workSpaceSize, + allowTensorOpMathConversion); #endif } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/conv_2d_kernels.cu b/lib/kernels/src/cuda/conv_2d_kernels.cu index aa4365c096..a6291baad1 100644 --- a/lib/kernels/src/cuda/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/conv_2d_kernels.cu @@ -1,12 +1,11 @@ +#include "device.h" #include "kernels/conv_2d_kernels.h" #include "kernels/device.h" -#include "device.h" namespace FlexFlow { namespace Kernels { namespace Conv2D { - cudnnConvolutionBwdDataAlgo_t selectConvolutionBackwardDataAlgorithm( cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, @@ -125,19 +124,19 @@ cudnnConvolutionBwdFilterAlgo_t selectConvolutionBackwardFilterAlgorithm( } Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - optional activation, - bool use_bias, - int kernel_h, - int kernel_w, - int groups, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - GenericTensorAccessorW const &input, - GenericTensorAccessorW const &output, - float const *filter_ptr, - float *filter_grad_ptr) { + optional activation, + bool use_bias, + int kernel_h, + int kernel_w, + int groups, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t biasTensor; @@ -227,16 +226,16 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, float time; // select forward algorithm fwdAlgo = selectConvolutionForwardAlgorithm(handle.dnn, - inputTensor, - input.get_float_ptr(), - filterDesc, - filter_ptr, - convDesc, - handle.workSpace, - handle.workSpaceSize, - outputTensor, - output.get_float_ptr(), - &time); + inputTensor, + input.get_float_ptr(), + filterDesc, + filter_ptr, + convDesc, + handle.workSpace, + handle.workSpaceSize, + outputTensor, + output.get_float_ptr(), + &time); // select backward filter algorithm bwdFilterAlgo = @@ -253,35 +252,34 @@ 
Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, &time); // select backward data algorithm - bwdDataAlgo = - selectConvolutionBackwardDataAlgorithm(handle.dnn, - filterDesc, - filter_ptr, - outputTensor, - output.get_float_ptr(), - convDesc, - handle.workSpace, - handle.workSpaceSize, - inputTensor, - input.get_float_ptr(), - &time); + bwdDataAlgo = selectConvolutionBackwardDataAlgorithm(handle.dnn, + filterDesc, + filter_ptr, + outputTensor, + output.get_float_ptr(), + convDesc, + handle.workSpace, + handle.workSpaceSize, + inputTensor, + input.get_float_ptr(), + &time); if (activation.has_value()) { checkCUDNN(cudnnSetActivationDescriptor( actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); } Conv2DPerDeviceState per_device_state = {handle, - activation, - use_bias, - inputTensor, - biasTensor, - outputTensor, - filterDesc, - actiDesc, - convDesc, - fwdAlgo, - bwdFilterAlgo, - bwdDataAlgo}; + activation, + use_bias, + inputTensor, + biasTensor, + outputTensor, + filterDesc, + actiDesc, + convDesc, + fwdAlgo, + bwdFilterAlgo, + bwdDataAlgo}; return per_device_state; } diff --git a/lib/kernels/src/hip/conv_2d_kernels.cpp b/lib/kernels/src/hip/conv_2d_kernels.cpp index 78a7e0f4d4..0a1b30655a 100644 --- a/lib/kernels/src/hip/conv_2d_kernels.cpp +++ b/lib/kernels/src/hip/conv_2d_kernels.cpp @@ -19,7 +19,6 @@ namespace FlexFlow { namespace Kernels { namespace Conv2D { - miopenConvFwdAlgorithm_t selectConvolutionForwardAlgorithm( miopenHandle_t handle, const miopenTensorDescriptor_t xDesc, @@ -141,19 +140,19 @@ miopenConvBwdDataAlgorithm_t selectConvolutionBackwardDataAlgorithm( } Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - optional activation, - bool use_bias, - int kernel_h, - int kernel_w, - int groups, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - float const *filter_ptr, - float *filter_grad_ptr) { + optional activation, + bool use_bias, 
+ int kernel_h, + int kernel_w, + int groups, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { miopenCreateTensorDescriptor inputTensor; miopenCreateTensorDescriptor biasTensor; @@ -185,8 +184,8 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, checkCUDNN(miopenSet4dTensorDescriptor( inputTensor, miopenFloat, input_n, input_c, input_h, input_w)); - checkCUDNN(miopenSet4dTensorDescriptor( - biasTensor, miopenFloat, 1, output_c, 1, 1)); + checkCUDNN( + miopenSet4dTensorDescriptor(biasTensor, miopenFloat, 1, output_c, 1, 1)); // Require that input_c is divisible by conv->groups assert(input_c % groups == 0); diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/runtime/src/ops/conv_2d.cc index 15bcaaba73..fb9c0880f8 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/runtime/src/ops/conv_2d.cc @@ -239,4 +239,4 @@ void register_task() { register_task(CONV2D_BWD_TASK_ID, "Conv2D Bwd", bwd, backward_task); } -} // namespace FlexFlow \ No newline at end of file +} // namespace FlexFlow From 321e4064a258b3c890b81bdffcaa949d2d69adb7 Mon Sep 17 00:00:00 2001 From: Kate Unger Date: Tue, 12 Sep 2023 13:42:25 -0700 Subject: [PATCH 5/5] conv2d --- lib/kernels/include/kernels/conv_2d_kernels.h | 14 +-- lib/kernels/src/cuda/conv_2d_kernels.cu | 112 ++++++++---------- lib/kernels/src/hip/conv_2d_kernels.cpp | 37 ++---- lib/runtime/src/ops/conv_2d.cc | 18 ++- 4 files changed, 78 insertions(+), 103 deletions(-) diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index 28e669886a..e8ae6fd014 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -11,9 +11,6 @@ namespace FlexFlow { struct Conv2DPerDeviceState { PerDeviceFFHandle handle; - optional activation; - bool use_bias; - ffTensorDescriptor_t inputTensor; 
ffTensorDescriptor_t biasTensor; ffTensorDescriptor_t outputTensor; @@ -22,13 +19,11 @@ struct Conv2DPerDeviceState { ffConvolutionDescriptor_t convDesc; ffConvolutionFwdAlgo_t fwdAlgo; ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; - ffConvolutionBwdDataAlgo_t bwdDataAlgo; + req bwdDataAlgo; }; FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, handle, - activation, - use_bias, inputTensor, biasTensor, outputTensor, @@ -44,7 +39,6 @@ namespace Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, optional activation, - bool use_bias, int kernel_h, int kernel_w, int groups, @@ -62,7 +56,8 @@ void forward_kernel(ffStream_t stream, float const *input_ptr, float *output_ptr, float const *filter_ptr, - float const *bias_ptr); + float const *bias_ptr, + optional activation); void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, @@ -72,7 +67,8 @@ void backward_kernel(ffStream_t stream, float *output_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, - float *bias_grad_ptr); + float *bias_grad_ptr, + optional activation); } // namespace Conv2D } // namespace Kernels diff --git a/lib/kernels/src/cuda/conv_2d_kernels.cu b/lib/kernels/src/cuda/conv_2d_kernels.cu index a6291baad1..f54b045db8 100644 --- a/lib/kernels/src/cuda/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/conv_2d_kernels.cu @@ -36,9 +36,6 @@ cudnnConvolutionBwdDataAlgo_t selectConvolutionBackwardDataAlgorithm( workSpaceSize)); assert(cnt > 0); checkCUDNN(perfResults[0].status); - printf("bwdDataAlgo(%d) time(%.2lf)\n", - perfResults[0].algo, - perfResults[0].time); if (time != nullptr) { *time = perfResults[0].time; } @@ -75,9 +72,6 @@ cudnnConvolutionFwdAlgo_t selectConvolutionForwardAlgorithm( workSpaceSize)); assert(cnt > 0); checkCUDNN(perfResults[0].status); - printf("forwardAlgo(%d) time(%.2lf)\n", - perfResults[0].algo, - perfResults[0].time); if (time != nullptr) { *time = perfResults[0].time; } @@ -114,9 +108,6 @@ cudnnConvolutionBwdFilterAlgo_t 
selectConvolutionBackwardFilterAlgorithm( workSpaceSize)); assert(cnt > 0); checkCUDNN(perfResults[0].status); - printf("bwdFilterAlgo(%d) time(%.2lf)\n", - perfResults[0].algo, - perfResults[0].time); if (time != nullptr) { *time = perfResults[0].time; } @@ -125,7 +116,6 @@ cudnnConvolutionBwdFilterAlgo_t selectConvolutionBackwardFilterAlgorithm( Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, optional activation, - bool use_bias, int kernel_h, int kernel_w, int groups, @@ -178,11 +168,6 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, // Require that input_c is divisible by conv->groups assert(input_c % groups == 0); - printf("filterDim: kernel(%d %d) c_in(%d), c_out(%d)\n", - kernel_h, - kernel_w, - input_c / groups, - output_c); checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, @@ -192,8 +177,8 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, kernel_w)); checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - pad_h, // conv->padding_h, - pad_w, // conv->padding_w, + pad_h, + pad_w, stride_h, stride_w, 1 /*upscale_x*/, @@ -235,7 +220,7 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, handle.workSpaceSize, outputTensor, output.get_float_ptr(), - &time); + nullptr); // select backward filter algorithm bwdFilterAlgo = @@ -249,7 +234,7 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, handle.workSpaceSize, filterDesc, filter_grad_ptr, - &time); + nullptr); // select backward data algorithm bwdDataAlgo = selectConvolutionBackwardDataAlgorithm(handle.dnn, @@ -262,15 +247,13 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, handle.workSpaceSize, inputTensor, input.get_float_ptr(), - &time); + nullptr); if (activation.has_value()) { checkCUDNN(cudnnSetActivationDescriptor( actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); } Conv2DPerDeviceState per_device_state = {handle, - activation, - use_bias, inputTensor, biasTensor, outputTensor, @@ -284,52 
+267,53 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, } void forward_kernel(cudaStream_t stream, - Conv2DPerDeviceState const *m, + Conv2DPerDeviceState const &m, + optional const &activation, float const *input_ptr, float *output_ptr, float const *filter_ptr, float const *bias_ptr) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; - checkCUDNN(cudnnConvolutionForward(m->handle.dnn, + checkCUDNN(cudnnConvolutionForward(m.handle.dnn, &alpha, - m->inputTensor, + m.inputTensor, input_ptr, - m->filterDesc, + m.filterDesc, filter_ptr, - m->convDesc, - m->fwdAlgo, - m->handle.workSpace, - m->handle.workSpaceSize, + m.convDesc, + m.fwdAlgo, + m.handle.workSpace, + m.handle.workSpaceSize, &beta, - m->outputTensor, + m.outputTensor, output_ptr)); - // use_bias == True if (bias_ptr != NULL) { - checkCUDNN(cudnnAddTensor(m->handle.dnn, + checkCUDNN(cudnnAddTensor(m.handle.dnn, &alpha, - m->biasTensor, + m.biasTensor, bias_ptr, &alpha, - m->outputTensor, + m.outputTensor, output_ptr)); } - if (m->activation) { - checkCUDNN(cudnnActivationForward(m->handle.dnn, - m->actiDesc, + if (activation.has_value()) { + checkCUDNN(cudnnActivationForward(m.handle.dnn, + m.actiDesc, &alpha, - m->outputTensor, + m.outputTensor, output_ptr, &beta, - m->outputTensor, + m.outputTensor, output_ptr)); } } void backward_kernel(cudaStream_t stream, - Conv2DPerDeviceState const *m, + Conv2DPerDeviceState const &m, + optional const &activation, float const *input_ptr, float *input_grad_ptr, float const *output_ptr, @@ -337,14 +321,14 @@ void backward_kernel(cudaStream_t stream, float const *kernel_ptr, float *kernel_grad_ptr, float *bias_grad_ptr) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f; // float beta = 0.0f; - if (m->activation) { + if (activation.has_value()) { cudnnDataType_t dataType; int n, c, h, w, 
nStride, cStride, hStride, wStride; - checkCUDNN(cudnnGetTensor4dDescriptor(m->outputTensor, + checkCUDNN(cudnnGetTensor4dDescriptor(m.outputTensor, &dataType, &n, &c, @@ -359,45 +343,45 @@ void backward_kernel(cudaStream_t stream, } // Compute filter gradiant // NOTE: we use alpha for kernel_grad to accumulate gradients - checkCUDNN(cudnnConvolutionBackwardFilter(m->handle.dnn, + checkCUDNN(cudnnConvolutionBackwardFilter(m.handle.dnn, &alpha, - m->inputTensor, + m.inputTensor, input_ptr, - m->outputTensor, + m.outputTensor, output_grad_ptr, - m->convDesc, - m->bwdFilterAlgo, - m->handle.workSpace, - m->handle.workSpaceSize, + m.convDesc, + m.bwdFilterAlgo, + m.handle.workSpace, + m.handle.workSpaceSize, &alpha, - m->filterDesc, + m.filterDesc, kernel_grad_ptr)); // Compute bias gradiant // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { - checkCUDNN(cudnnConvolutionBackwardBias(m->handle.dnn, + checkCUDNN(cudnnConvolutionBackwardBias(m.handle.dnn, &alpha, - m->outputTensor, + m.outputTensor, output_grad_ptr, &alpha, - m->biasTensor, + m.biasTensor, bias_grad_ptr)); } // Compute data gradiant // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { - checkCUDNN(cudnnConvolutionBackwardData(m->handle.dnn, + checkCUDNN(cudnnConvolutionBackwardData(m.handle.dnn, &alpha, - m->filterDesc, + m.filterDesc, kernel_ptr, - m->outputTensor, + m.outputTensor, output_grad_ptr, - m->convDesc, - m->bwdDataAlgo, - m->handle.workSpace, - m->handle.workSpaceSize, + m.convDesc, + m.bwdDataAlgo, + m.handle.workSpace, + m.handle.workSpaceSize, &alpha, - m->inputTensor, + m.inputTensor, input_grad_ptr)); } } diff --git a/lib/kernels/src/hip/conv_2d_kernels.cpp b/lib/kernels/src/hip/conv_2d_kernels.cpp index 0a1b30655a..761a59b545 100644 --- a/lib/kernels/src/hip/conv_2d_kernels.cpp +++ b/lib/kernels/src/hip/conv_2d_kernels.cpp @@ -50,9 +50,6 @@ miopenConvFwdAlgorithm_t selectConvolutionForwardAlgorithm( false)); 
assert(cnt > 0); // checkCUDNN(perfResults[0].status); - printf("forwardAlgo(%d) time(%.2lf)\n", - perfResults[0].fwd_algo, - perfResults[0].time); if (time != nullptr) { *time = perfResults[0].time; } @@ -90,9 +87,6 @@ miopenConvBwdWeightsAlgorithm_t selectConvolutionBackwardFilterAlgorithm( false)); assert(cnt > 0); // checkCUDNN(perfResults[0].status); - printf("bwdFilterAlgo(%d) time(%.2lf)\n", - perfResults[0].bwd_weights_algo, - perfResults[0].time); if (time != nullptr) { *time = perfResults[0].time; } @@ -130,9 +124,6 @@ miopenConvBwdDataAlgorithm_t selectConvolutionBackwardDataAlgorithm( false)); assert(cnt > 0); // checkCUDNN(perfResults[0].status); - printf("bwdDataAlgo(%d) time(%.2lf)\n", - perfResults[0].bwd_data_algo, - perfResults[0].time); if (time != nullptr) { *time = perfResults[0].time; } @@ -141,12 +132,11 @@ miopenConvBwdDataAlgorithm_t selectConvolutionBackwardDataAlgorithm( Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, optional activation, - bool use_bias, int kernel_h, int kernel_w, int groups, - int pad_h, - int pad_w, + int padding_h, + int padding_w, int stride_h, int stride_w, GenericTensorAccessorR const &input, @@ -189,11 +179,6 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, // Require that input_c is divisible by conv->groups assert(input_c % groups == 0); - printf("filterDim: kernel(%d %d) c_in(%d), c_out(%d)\n", - kernel_h, - kernel_w, - input_c / groups, - output_c); checkCUDNN(miopenSet4dTensorDescriptor(m->filterDesc, miopenFloat, output_c, @@ -289,12 +274,13 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, } } -void forward_kernel(hipStream_t stream, - Conv2DPerDeviceState const *m, +void forward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, float const *input_ptr, float *output_ptr, float const *filter_ptr, - float const *bias_ptr) { + float const *bias_ptr, + optional activation) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); @@ -335,15 +321,16 @@ void 
forward_kernel(hipStream_t stream, } } -void backward_kernel(hipStream_t stream, - Conv2DPerDeviceState const *m, +void backward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, float const *input_ptr, float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *bias_grad_ptr) { + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + optional activation) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/runtime/src/ops/conv_2d.cc index fb9c0880f8..e0e1872a06 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/runtime/src/ops/conv_2d.cc @@ -40,6 +40,7 @@ OpTaskInvocation init(Conv2DAttrs const &attrs) { OpTaskInvocation forward(Conv2DAttrs const &attrs) { OpTaskBinding binding; + binding.bind_arg(ATTRS, attrs); binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); @@ -72,7 +73,6 @@ static DeviceSpecific acc.create_device_specific( init_kernel(handle, attrs.activation, - attrs.use_bias, attrs.kernel_h, attrs.kernel_w, attrs.groups, @@ -100,6 +100,7 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); @@ -109,11 +110,12 @@ static optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, "[Conv2d] forward_time = %.2lfms\n", - &per_device_state, + per_device_state, input.get_float_ptr(), output.get_float_ptr(), filter.get_float_ptr(), - bias.get_float_ptr()); + bias.get_float_ptr(), + attrs.activation); } static void forward_task(Task const *task, @@ -128,6 +130,7 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling 
= acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -141,14 +144,15 @@ static optional backward_task_impl(TaskArgumentAccessor const &acc) { return profile(backward_kernel, profiling, "[Conv2d] backward_time = %.2lfms\n", - &per_device_state, + per_device_state, input.get_float_ptr(), input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), - bias_grad.get_float_ptr()); + bias_grad.get_float_ptr(), + attrs.activation); } static void backward_task(Task const *task, @@ -185,6 +189,7 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim, SimTaskBinding fwd_binding; fwd_binding.bind_arg(PROFILING, settings); fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); + init_binding.bind_arg(ATTRS, attrs); fwd_binding.bind(INPUT, input_shape); fwd_binding.bind(OUTPUT, output_shape); @@ -213,6 +218,8 @@ void register_task() { init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); + init.add_return_value(); + register_task(CONV2D_INIT_TASK_ID, "Conv2D Init", init, init_task); } @@ -222,6 +229,7 @@ void register_task() { fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + init.add_arg_slot(ATTRS); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT);