diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h
index 8ea37b1c1c..7cb5048d37 100644
--- a/lib/kernels/include/kernels/linear_kernels.h
+++ b/lib/kernels/include/kernels/linear_kernels.h
@@ -5,27 +5,46 @@ namespace FlexFlow {

-class LinearPerDeviceState : public PerDeviceOpState {
-public:
-  LinearPerDeviceState(FFHandler handle, int batch_size);
+struct LinearPerDeviceState {
+  PerDeviceFFHandle handle;
   ffTensorDescriptor_t outputTensor;
   ffActivationDescriptor_t actiDesc;
-
-public:
-  float const *one_ptr;
-  ActiMode activation;
+  // device vector of batch_size ones; allocated and filled by init_kernel,
+  // consumed by the bias GEMMs in forward_kernel/backward_kernel
+  float const *one_ptr;
+  // kept as ActiMode (not cudnnActivationMode_t): the kernels still branch on
+  // AC_MODE_GELU / AC_MODE_NONE, which have no cuDNN equivalent
+  ActiMode activation;
   optional<Regularizer> regularizer;
   bool use_bias;
   DataType input_type, weight_type, output_type;
 };

+FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState,
+                                             handle,
+                                             outputTensor,
+                                             actiDesc,
+                                             one_ptr,
+                                             activation,
+                                             regularizer,
+                                             use_bias,
+                                             input_type,
+                                             weight_type,
+                                             output_type);
+
 namespace Kernels {
 namespace Linear {

-void init_kernel(LinearPerDeviceState *m, int batch_size, int channel);
+
+LinearPerDeviceState init_kernel(PerDeviceFFHandle handle,
+                                 Allocator allocator,
+                                 ActiMode activation,
+                                 optional<Regularizer> regularizer,
+                                 bool use_bias,
+                                 DataType input_type,
+                                 DataType weight_type,
+                                 DataType output_type,
+                                 int batch_size,
+                                 int channel);
+
 bool use_activation(ActiMode mode);

 void forward_kernel(ffStream_t stream,
-                    LinearPerDeviceState const *m,
+                    LinearPerDeviceState const &m,
                     void const *input_ptr,
                     void *output_ptr,
                     void const *filter_ptr,
@@ -34,7 +53,7 @@ void forward_kernel(ffStream_t stream,
                     int out_dim,
                     int batch_size);
 void backward_kernel(ffStream_t stream,
-                     LinearPerDeviceState const *m,
+                     LinearPerDeviceState const &m,
                      void const *input_ptr,
                      void *input_grad_ptr,
                      void const *output_ptr,
diff --git a/lib/kernels/src/cuda/linear_kernels.cu b/lib/kernels/src/cuda/linear_kernels.cu
index 58a6a6b57e..9fc984f7d6 100644
--- a/lib/kernels/src/cuda/linear_kernels.cu
+++ b/lib/kernels/src/cuda/linear_kernels.cu
@@ -13,76 +13,78 @@
  * limitations under the License.
 */

+#include "kernels/allocation.h"
 #include "kernels/cuda_helper.h"
 #include "kernels/linear_kernels.h"

 namespace FlexFlow {

-LinearPerDeviceState::LinearPerDeviceState(FFHandler handler, int batch_size)
-    : PerDeviceOpState(handler) {
-  // Allocate an all-one's vector
-  float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size);
-  for (int i = 0; i < batch_size; i++) {
-    dram_one_ptr[i] = 1.0f;
-  }
-  float *fb_one_ptr;
-  checkCUDA(cudaMalloc(&fb_one_ptr, sizeof(float) * batch_size));
-  checkCUDA(cudaMemcpy(fb_one_ptr,
-                       dram_one_ptr,
-                       sizeof(float) * batch_size,
-                       cudaMemcpyHostToDevice));
-  one_ptr = (float const *)fb_one_ptr;
-  // Allocate descriptors
-  checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc));
-  checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
-}
-
 namespace Kernels {
 namespace Linear {

-bool use_activation(ActiMode mode) {
-  switch (mode) {
-    case AC_MODE_RELU:
-    case AC_MODE_SIGMOID:
-    case AC_MODE_TANH:
-      return true;
-    case AC_MODE_NONE:
-      return false;
-    default:
-      assert(0);
+LinearPerDeviceState init_kernel(PerDeviceFFHandle handle,
+                                 Allocator allocator,
+                                 ActiMode activation,
+                                 optional<Regularizer> regularizer,
+                                 bool use_bias,
+                                 DataType input_type,
+                                 DataType weight_type,
+                                 DataType output_type,
+                                 int batch_size,
+                                 int channel) {
+  ffTensorDescriptor_t outputTensor;
+  ffActivationDescriptor_t actiDesc;
+  checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
+  checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc));
+  checkCUDNN(cudnnSetTensor4dDescriptor(outputTensor,
+                                        CUDNN_TENSOR_NCHW,
+                                        ff_to_cudnn_datatype(output_type),
+                                        batch_size,
+                                        channel,
+                                        1,
+                                        1));
+  cudnnActivationMode_t mode = CUDNN_ACTIVATION_IDENTITY;
+  switch (activation) {
+    case AC_MODE_RELU:
+      mode = CUDNN_ACTIVATION_RELU;
+      break;
+    case AC_MODE_SIGMOID:
+      mode = CUDNN_ACTIVATION_SIGMOID;
+      break;
+    case AC_MODE_TANH:
+      mode = CUDNN_ACTIVATION_TANH;
+      break;
+    default:
+      // cuDNN has no GELU mode: AC_MODE_GELU is applied by the custom kernel
+      // in forward_kernel, and AC_MODE_NONE needs no activation at all, so
+      // the descriptor is left as identity for both.
+      break;
   }
-  return false;
-}
+  checkCUDNN(
+      cudnnSetActivationDescriptor(actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0));

-void init_kernel(LinearPerDeviceState *m, int batch_size, int channel) {
-  if (use_activation(m->activation)) {
-    cudnnActivationMode_t mode;
-    switch (m->activation) {
-      case AC_MODE_RELU:
-        mode = CUDNN_ACTIVATION_RELU;
-        break;
-      case AC_MODE_SIGMOID:
-        mode = CUDNN_ACTIVATION_SIGMOID;
-        break;
-      default:
-        // Unsupported activation mode
-        assert(false);
-    }
-    checkCUDNN(cudnnSetActivationDescriptor(
-        m->actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0));
-    checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor,
-                                          CUDNN_TENSOR_NCHW,
-                                          ff_to_cudnn_datatype(m->output_type),
-                                          batch_size,
-                                          channel,
-                                          1,
-                                          1));
-  }
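+  // one_ptr is the all-ones vector that the bias GEMMs in forward_kernel and
+  // backward_kernel contract against, so it needs exactly batch_size floats.
+  // The code below assumes Allocator::allocate(size_t) returns device memory
+  // (as kernels/allocation.h suggests); if it returned host memory instead,
+  // the cudaMemcpy kind would have to change accordingly.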
+  float *one_ptr =
+      static_cast<float *>(allocator.allocate(sizeof(float) * batch_size));
+  {
+    float *host_one = (float *)malloc(sizeof(float) * batch_size);
+    for (int i = 0; i < batch_size; i++) {
+      host_one[i] = 1.0f;
+    }
+    checkCUDA(cudaMemcpy(one_ptr,
+                         host_one,
+                         sizeof(float) * batch_size,
+                         cudaMemcpyHostToDevice));
+    free(host_one);
+  }
+  LinearPerDeviceState per_device_state = {handle,
+                                           outputTensor,
+                                           actiDesc,
+                                           one_ptr,
+                                           activation,
+                                           regularizer,
+                                           use_bias,
+                                           input_type,
+                                           weight_type,
+                                           output_type};
+  return per_device_state;
 }

+// Still declared in linear_kernels.h and used by forward_kernel below;
+// RELU/SIGMOID/TANH go through cudnnActivationForward, while GELU and NONE
+// are handled outside it.
+bool use_activation(ActiMode mode) {
+  switch (mode) {
+    case AC_MODE_RELU:
+    case AC_MODE_SIGMOID:
+    case AC_MODE_TANH:
+      return true;
+    case AC_MODE_GELU:
+    case AC_MODE_NONE:
+      return false;
+    default:
+      assert(false);
+      return false;
+  }
+}
+
 void forward_kernel(cudaStream_t stream,
-                    LinearPerDeviceState const *m,
+                    LinearPerDeviceState const &m,
                     void const *input_ptr,
                     void *output_ptr,
                     void const *weight_ptr,
@@ -91,19 +93,19 @@ void forward_kernel(cudaStream_t stream,
                     int out_dim,
                     int batch_size) {
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  checkCUDA(cublasSetStream(m.handle.blas, stream));
+  checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
   float alpha = 1.0f, beta = 0.0f;
-  cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type);
-  cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type);
-  cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type);
+  cudaDataType_t input_type = ff_to_cuda_datatype(m.input_type);
+  cudaDataType_t weight_type = ff_to_cuda_datatype(m.weight_type);
+  cudaDataType_t output_type = ff_to_cuda_datatype(m.output_type);
 #if CUDA_VERSION >= 11000
   // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
   cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
 #else
   cudaDataType_t compute_type = CUDA_R_32F;
 #endif
-  checkCUDA(cublasGemmEx(m->handle.blas,
+  checkCUDA(cublasGemmEx(m.handle.blas,
                          CUBLAS_OP_T,
                          CUBLAS_OP_N,
                          out_dim,
@@ -124,7 +126,7 @@ void forward_kernel(cudaStream_t stream,
                          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
   // use_bias = True
   if (bias_ptr != NULL) {
-    checkCUDA(cublasGemmEx(m->handle.blas,
+    checkCUDA(cublasGemmEx(m.handle.blas,
                            CUBLAS_OP_T,
                            CUBLAS_OP_N,
                            out_dim,
@@ -134,7 +136,7 @@ void forward_kernel(cudaStream_t stream,
                            bias_ptr,
                            weight_type,
                            1,
-                           m->one_ptr,
+                           m.one_ptr,
                            CUDA_R_32F,
                            1,
                            &alpha,
@@ -144,22 +146,22 @@ void forward_kernel(cudaStream_t stream,
                            compute_type,
                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
   }
-  if (use_activation(m->activation)) {
-    checkCUDNN(cudnnActivationForward(m->handle.dnn,
-                                      m->actiDesc,
+  if (use_activation(m.activation)) {
+    checkCUDNN(cudnnActivationForward(m.handle.dnn,
+                                      m.actiDesc,
                                       &alpha,
-                                      m->outputTensor,
+                                      m.outputTensor,
                                       output_ptr,
                                       &beta,
-                                      m->outputTensor,
+                                      m.outputTensor,
                                       output_ptr));
-  } else if (m->activation == AC_MODE_GELU) {
+  } else if (m.activation == AC_MODE_GELU) {
     size_t elements = (size_t)out_dim * (size_t)batch_size;
     constexpr float B = 0.7978845608028654f;   // sqrt(2.0/M_PI)
     constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI)
     gelu_forward_kernel<<<GET_BLOCKS(elements), CUDA_NUM_THREADS, 0, stream>>>(
         elements, B, C, (float *)output_ptr);
-  } else if (m->activation == AC_MODE_NONE) {
+  } else if (m.activation == AC_MODE_NONE) {
     // Do nothing
   } else {
     assert(false && "Unsupported activation for Linear");
@@ -167,7 +169,7 @@ void forward_kernel(cudaStream_t stream,
 }

 void backward_kernel(cudaStream_t stream,
-                     LinearPerDeviceState const *m,
+                     LinearPerDeviceState const &m,
                      void const *input_ptr,
                      void *input_grad_ptr,
                      void const *output_ptr,
@@ -179,13 +181,13 @@ void backward_kernel(cudaStream_t stream,
                      int out_dim,
                      int batch_size) {
-  checkCUDA(cublasSetStream(m->handle.blas, stream));
-  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+  checkCUDA(cublasSetStream(m.handle.blas, stream));
+  checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
   float alpha = 1.0f;
-  cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type);
-  cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type);
-  cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type);
+  cudaDataType_t input_type = ff_to_cuda_datatype(m.input_type);
+  cudaDataType_t weight_type = ff_to_cuda_datatype(m.weight_type);
+  cudaDataType_t output_type = ff_to_cuda_datatype(m.output_type);
 #if CUDA_VERSION >= 11000
   // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance
   cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
@@ -193,19 +195,19 @@ void backward_kernel(cudaStream_t stream,
   cudaDataType_t compute_type = CUDA_R_32F;
 #endif
   int output_size = out_dim * batch_size;
-  if (m->activation == AC_MODE_RELU) {
+  if (m.activation == AC_MODE_RELU) {
     relu_backward_kernel(
-        m->output_type, output_grad_ptr, output_ptr, output_size, stream);
-  } else if (m->activation == AC_MODE_SIGMOID) {
+        m.output_type, output_grad_ptr, output_ptr, output_size, stream);
+  } else if (m.activation == AC_MODE_SIGMOID) {
     sigmoid_backward_kernel(
-        m->output_type, output_grad_ptr, output_ptr, output_size, stream);
+        m.output_type, output_grad_ptr, output_ptr, output_size, stream);
   } else {
     // TODO: only support relu and sigmoid for now
-    assert(m->activation == AC_MODE_NONE);
+    assert(m.activation == AC_MODE_NONE);
   }
   // Compute weight gradiant
   // NOTE: we use alpha=1 for kernel_grad to accumulate gradients
-  checkCUDA(cublasGemmEx(m->handle.blas,
+  checkCUDA(cublasGemmEx(m.handle.blas,
                          CUBLAS_OP_N,
                          CUBLAS_OP_T,
                          in_dim,
@@ -224,10 +226,10 @@ void backward_kernel(cudaStream_t stream,
                          in_dim,
                          compute_type,
                          CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-  if (m->kernel_reg_type == REG_MODE_NONE) {
+  if (m.kernel_reg_type == REG_MODE_NONE) {
     // do nothing
-  } else if (m->kernel_reg_type == REG_MODE_L2) {
-    checkCUDA(cublasSgeam(m->handle.blas,
+  } else if (m.kernel_reg_type == REG_MODE_L2) {
+    checkCUDA(cublasSgeam(m.handle.blas,
                           CUBLAS_OP_N,
                           CUBLAS_OP_N,
                           in_dim,
@@ -235,7 +237,7 @@ void backward_kernel(cudaStream_t stream,
                           &alpha,
                           (float *)kernel_grad_ptr,
                           in_dim,
-                          &(m->kernel_reg_lambda),
+                          &(m.kernel_reg_lambda),
                           (float *)kernel_ptr,
                           in_dim,
                           (float *)kernel_grad_ptr,
@@ -248,14 +250,14 @@ void backward_kernel(cudaStream_t stream,
   // NOTE: we use alpha=1 for bias_grad to accumulate gradients
   // use_bias = True
   if (bias_grad_ptr != NULL) {
-    checkCUDA(cublasGemmEx(m->handle.blas,
+    checkCUDA(cublasGemmEx(m.handle.blas,
                            CUBLAS_OP_N,
                            CUBLAS_OP_T,
                            1,
                            out_dim,
                            batch_size,
                            &alpha,
-                           m->one_ptr,
+                           m.one_ptr,
                            CUDA_R_32F,
                            1,
                            output_grad_ptr,
@@ -271,7 +273,7 @@ void backward_kernel(cudaStream_t stream,
   // Compute data gradiant
   // NOTE: we use alpha=1 for input_grad to accumulate gradients
   if (input_grad_ptr != NULL) {
-    checkCUDA(cublasGemmEx(m->handle.blas,
+    checkCUDA(cublasGemmEx(m.handle.blas,
                            CUBLAS_OP_N,
                            CUBLAS_OP_N,
                            in_dim,
diff --git a/lib/runtime/src/ops/element_binary.cc b/lib/runtime/src/ops/element_binary.cc
index 92ba0a7fb7..f6be2198ca 100644
--- a/lib/runtime/src/ops/element_binary.cc
+++ b/lib/runtime/src/ops/element_binary.cc
@@ -213,7 +213,7 @@ OpTaskSignature init_signature() {

   init.add_return_value();

-  return init;
+  return init; // TODO: this may be wrong; the header declares a void return type
 }

 template <>
diff --git a/lib/runtime/src/ops/linear.cc b/lib/runtime/src/ops/linear.cc
index 00ef5c7b12..96d037913c 100644
--- a/lib/runtime/src/ops/linear.cc
+++ b/lib/runtime/src/ops/linear.cc
@@ -2,6 +2,10 @@
 #include "kernels/linear_kernels.h"
 #include "layer.h"
 #include "legion/legion_utilities.h"
+#include "op-attrs/ff_dim.h"
+#include "op-attrs/get_output_shapes.h"
+#include "utils/exceptions.h"
+#include "utils/graph/views.h"
#include "utils/hash-utils.h" namespace FlexFlow { @@ -25,1063 +29,281 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Linear; -static constexpr int KERNEL_IDX = 0; -static constexpr int BIAS_IDX = 1; +enum slots { + INPUT, + OUTPUT, + WEIGHT, + BIAS, + ATTRS, + PROFILING, + HANDLE, + PER_DEVICE_STATE +}; -Tensor FFModel::dense(const Tensor input, - int outDim, - ActiMode activation, - bool use_bias, - DataType data_type, - Layer const *shared_op, - Initializer *kernel_initializer, - Initializer *bias_initializer, - char const *name) { - Layer *li = new Layer(this, - OP_LINEAR, - data_type, - name, - 1 /*inputs*/, - use_bias ? 2 : 1 /*weights*/, - 1 /*outputs*/, - input); - { - int numdims = input->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = input->dims[i]; - } - dims[0] = outDim; - li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, data_type, li, 0, true /*create_grad*/); - } - { - int dims[2] = {input->dims[0], outDim}; - li->weights[KERNEL_IDX] = - create_weight_legion_ordering(2, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - if (use_bias) { - int dims[1] = {outDim}; - li->weights[BIAS_IDX] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - bias_initializer, - CHOSEN_SYNC_TYPE); - } - li->add_int_property("use_bias", use_bias); - li->add_int_property("out_dim", outDim); - li->add_int_property("activation", activation); - layers.push_back(li); - return li->outputs[0]; -} +OpTaskInvocation init(LinearAttrs const &attrs) { + OpTaskBinding binding; -Op *Linear::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - long long value; - layer->get_int_property("use_bias", value); - bool use_bias = (bool)value; - layer->get_int_property("out_dim", value); - int outdim = value; - layer->get_int_property("activation", value); - ActiMode activation = (ActiMode)value; - return new Linear(model, - layer->layer_guid, - inputs[0], - outdim, - activation, - use_bias, - layer->data_type, - false /*allocate_weights*/, - layer->name); -} + bind.bind_arg(HANDLE, ff_handle()); + bind.bind_arg(ATTRS, attrs); -// size_t Linear::get_params_hash() const { -// return this->get_params().get_hash(this->inputs[0]); -// } + bind.bind(INPUT, input_tensor(0)); // input + bind.bind(WEIGHT, weight_tensor(0)); // weight + bind.bind(OUTPUT, output_tensor(0)); // output -Linear::Linear(FFModel &model, - Linear const &other, - const ParallelTensor input, - bool allocate_weights) - : Linear(model, - other.layer_guid, - input, - other.out_channels, - other.activation, - other.use_bias, - other.data_type, - allocate_weights, - other.name) {} - -Linear::Linear(FFModel &model, - LinearParams const ¶ms, - ParallelTensor const input, - char const *name, - bool allocate_weights) - : Linear(model, - params.layer_guid, - input, - params.out_channels, - params.activation, - params.use_bias, - params.data_type, - allocate_weights, - name) {} - -Linear::Linear(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor _input, - int out_dim, - ActiMode _activation, - bool _use_bias, - DataType _data_type, - bool allocate_weights, - char const *name) - : Op(model, - OP_LINEAR, - _data_type, - name, - 1 /*inputs*/, - _use_bias ? 
-         allocate_weights,
-         1 /*outputs*/,
-         _input),
-      out_channels(out_dim), activation(_activation), use_bias(_use_bias),
-      replica(ParallelTensorBase::NO_TENSOR) {
-  // overwrite layer_guid
-  layer_guid = _layer_guid;
-  data_type = _data_type;
-  auto dimension_names =
-      this->get_params().get_dimension_names(_input->get_shape());
-  this->in_channels =
-      _input->dims[dimension_names.at(LinearParams::INPUT_CHANNEL)].size;
-
-  ParallelTensorShape input_shape = this->inputs[0]->get_shape();
-  ParallelTensorShape output_shape, kernel_shape, bias_shape;
-  LinearParams params = this->get_params();
-  params.construct_mappings(*this->parallel_dims_mapping, input_shape);
-  params.solve_dims(input_shape, output_shape, kernel_shape, bias_shape);
+  return {LINEAR_INIT_TASK_ID, binding};
+}

-  if (allocate_weights) {
-    Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/);
+OpTaskInvocation forward(LinearAttrs const &attrs) {
+  OpTaskBinding binding;

-    weights[KERNEL_IDX] =
-        model.create_parallel_weight_legion_ordering(kernel_shape.num_dims,
-                                                     kernel_shape.dims,
-                                                     _data_type,
-                                                     NULL /*owner_op*/,
-                                                     true /*create_grad*/,
-                                                     kernel_initializer,
-                                                     CHOSEN_SYNC_TYPE);
+  binding.bind(INPUT, input_tensor(0));
+  binding.bind(WEIGHT, weight_tensor(0));
+  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(BIAS, bias_tensor(0));

-    if (use_bias) {
-      Initializer *bias_initializer = new ZeroInitializer();
+  binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(PER_DEVICE_STATE, per_device_state<LinearPerDeviceState>());
+  binding.bind_arg(ATTRS, attrs);

-      weights[BIAS_IDX] =
-          model.create_parallel_weight_legion_ordering(bias_shape.num_dims,
-                                                       bias_shape.dims,
-                                                       _data_type,
-                                                       NULL /*owner_op*/,
-                                                       true /*create_grad*/,
-                                                       bias_initializer,
-                                                       CHOSEN_SYNC_TYPE);
-    }
-  }
+  return {LINEAR_FWD_TASK_ID, binding};
+}

-  // Create the output tensor
-  outputs[0] = model.create_parallel_tensor_legion_ordering(
-      output_shape.num_dims, output_shape.dims, _data_type, this);
+OpTaskInvocation backward(LinearAttrs const &attrs) {
+  OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);

-  assert(check_output_input_weight_parallel_dims(allocate_weights));
+  return {LINEAR_BWD_TASK_ID, b};
 }

-void Linear::init(FFModel const &ff) {
-  assert(check_output_input_weight_same_parallel_is());
-  // assert(check_output_input_weight_same_machine_view());
-  parallel_is = outputs[0]->parallel_is;
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_init(ff, argmap);
-  IndexLauncher launcher(LINEAR_INIT_TASK_ID,
-                         parallel_is,
-                         TaskArgument(this, sizeof(Linear)),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  // launcher.add_region_requirement(
-  //     RegionRequirement(input_lps[0], 0/*projection id*/,
-  //                       READ_ONLY, EXCLUSIVE, inputs[0]->region));
-  // launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(1, FID_DATA);
-  // launcher.add_region_requirement(
-  //     RegionRequirement(weights[1]->part, 0/*projection id*/,
-  //                       READ_ONLY, EXCLUSIVE, weights[1]->region));
-  // launcher.add_field(3, FID_DATA);
-  if (ff.config.computationMode == COMP_MODE_TRAINING) {
-    // Add inputs[0].region_grad to avoid Legion warning
-    // launcher.add_region_requirement(
-    //     RegionRequirement(input_grad_lps[0], 0/*projection id*/,
-    //                       WRITE_ONLY, EXCLUSIVE, inputs[0].region_grad));
-    // launcher.add_field(2, FID_DATA);
-  }
-  FutureMap fm = runtime->execute_index_space(ctx, launcher);
-  fm.wait_all_results();
-  set_opmeta_from_futuremap(ff, fm);
+static DeviceSpecific<LinearPerDeviceState>
+    init_task_impl(TaskArgumentAccessor const &acc) {
+  auto const &attrs = acc.get_argument<LinearAttrs>(ATTRS);
+  Allocator allocator = acc.get_allocator();
+  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+
+  auto input = acc.get_tensor<Permissions::RO>(INPUT);
+  auto weight = acc.get_tensor<Permissions::RO>(WEIGHT);
+  auto output = acc.get_tensor<Permissions::RO>(OUTPUT);
+  int out_dim = output.shape.at(ff_dim_t{0});
+  int batch_size = output.shape.at(ff_dim_t{1});
+
+  DeviceSpecific<LinearPerDeviceState> state =
+      acc.create_device_specific<LinearPerDeviceState>(
+          init_kernel(handle,
+                      allocator,
+                      attrs.activation,
+                      attrs.regularizer,
+                      attrs.use_bias,
+                      input.data_type,
+                      weight.data_type,
+                      output.data_type,
+                      batch_size,
+                      attrs.out_channels));
+  return state;
 }

-/*
-  regions[0](O): output
-  regions[1](I): kernel
-  regions[2](I): bias
-*/
-PerDeviceOpState *Linear::init_task(Task const *task,
-                                    std::vector<PhysicalRegion> const &regions,
-                                    Context ctx,
-                                    Runtime *runtime) {
-  Domain out_domain = runtime->get_index_space_domain(
-      ctx, task->regions[0].region.get_index_space());
-  switch (out_domain.get_dim()) {
-#define DIMFUNC(DIM)                                                          \
-  case DIM:                                                                   \
-    return init_task_with_dim<DIM>(task, regions, ctx, runtime);
-    LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-    default:
-      assert(false);
-  }
-  return NULL;
+static DeviceSpecific<LinearPerDeviceState>
+    init_task(Task const *task,
+              std::vector<PhysicalRegion> const &regions,
+              Context ctx,
+              Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  return init_task_impl(acc);
 }

-template <int NDIM>
-PerDeviceOpState *
-    Linear::init_task_with_dim(Task const *task,
-                               std::vector<PhysicalRegion> const &regions,
-                               Context ctx,
-                               Runtime *runtime) {
-  assert(regions.size() == task->regions.size());
-  assert(regions.size() == 2 || regions.size() == 3);
-  Linear const *linear = (Linear *)task->args;
-  FFHandler handle = *((FFHandler const *)task->local_args);
-  // TensorAccessorR acc_input(
-  //     regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  TensorAccessorW acc_output(regions[0],
-                             task->regions[0],
-                             FID_DATA,
-                             ctx,
-                             runtime,
-                             false /*readOutput*/);
-  TensorAccessorW acc_kernel(regions[1],
-                             task->regions[1],
-                             FID_DATA,
-                             ctx,
-                             runtime,
-                             false /*readOutput*/);
-  // TensorAccessorR acc_bias(
-  //     regions[3], task->regions[3], FID_DATA, ctx, runtime);
-  // int in_dim = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1;
-  int in_dim = acc_kernel.rect.hi[0] - acc_kernel.rect.lo[0] + 1;
-  int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1;
-  int batch_size = acc_output.rect.volume() / out_dim;
-  printf("init linear (input): in_dim(%d) out_dim(%d) batch_size(%d)\n",
-         in_dim,
-         out_dim,
-         batch_size);
-  LinearMeta *m = new LinearMeta(handle, batch_size);
-  m->activation = linear->activation;
-  m->use_bias = linear->use_bias;
-  m->profiling = linear->profiling;
-  m->trainableInputs[0] = linear->trainableInputs[0];
-  m->input_type = linear->inputs[0]->data_type;
-  m->weight_type = linear->weights[0]->data_type;
-  m->output_type = linear->outputs[0]->data_type;
-  std::strcpy(m->op_name, linear->name);
+static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
+  auto input = acc.get_tensor<Permissions::RO>(INPUT);
+  auto weight = acc.get_tensor<Permissions::RO>(WEIGHT);
+  auto output = acc.get_tensor<Permissions::WO>(OUTPUT);
+  auto bias = acc.get_tensor<Permissions::RO>(BIAS);

-  init_kernel(m, batch_size, out_dim);
+  auto per_device_state =
+      acc.get_argument<LinearPerDeviceState>(PER_DEVICE_STATE);
+  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  auto attrs = acc.get_argument<LinearAttrs>(ATTRS);

-  return m;
-}
+  int in_dim = input.shape.at(ff_dim_t{0});
+  int out_dim = output.shape.at(ff_dim_t{0});
+  int batch_size = output.shape.get_volume() / out_dim;

-void Linear::forward(FFModel const &ff) {
-  ArgumentMap argmap;
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  set_argumentmap_for_forward(ff, argmap);
-  IndexLauncher launcher(LINEAR_FWD_TASK_ID,
-                         parallel_is,
-                         TaskArgument(nullptr, 0),
-                         argmap,
-                         Predicate::TRUE_PRED,
-                         false /*must*/,
-                         0 /*mapper_id*/,
-                         outputs[0]->machine_view.hash());
-  launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    inputs[0]->region));
-  launcher.add_field(0, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    outputs[0]->region));
-  launcher.add_field(1, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                    0 /*projection id*/,
-                                                    READ_ONLY,
-                                                    EXCLUSIVE,
-                                                    weights[0]->region));
-  launcher.add_field(2, FID_DATA);
-  if (use_bias) {
-    launcher.add_region_requirement(RegionRequirement(weights[1]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      weights[1]->region));
-    launcher.add_field(3, FID_DATA);
+  float const *bias_ptr = NULL;
+  if (attrs.use_bias) {
+    bias_ptr = bias.get_float_ptr();
   }
-  runtime->execute_index_space(ctx, launcher);
-}

-void Linear::forward_task(Task const *task,
-                          std::vector<PhysicalRegion> const &regions,
-                          Context ctx,
-                          Runtime *runtime) {
-  Domain in_domain = runtime->get_index_space_domain(
-      ctx, task->regions[0].region.get_index_space());
-  switch (in_domain.get_dim()) {
-#define DIMFUNC(DIM)                                                          \
-  case DIM:                                                                   \
-    return forward_task_with_dim<DIM>(task, regions, ctx, runtime);
-    LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-    default:
-      assert(false);
-  }
+  return profile(forward_kernel,
+                 profiling,
+                 "[Linear] forward_time = %.2lfms\n",
+                 per_device_state,
+                 input.get_float_ptr(),
+                 output.get_float_ptr(),
+                 weight.get_float_ptr(),
+                 bias_ptr,
+                 in_dim,
+                 out_dim,
+                 batch_size);
 }

-/*
-  regions[0](I); input
-  regions[1](O): output
-  regions[2](I): kernel
-  regions[3](I): bias
-*/
-template <int NDIM>
-void Linear::forward_task_with_dim(Task const *task,
-                                   std::vector<PhysicalRegion> const &regions,
-                                   Context ctx,
-                                   Runtime *runtime) {
-  // Linear* linear = (Linear*) task->args;
-  LinearMeta const *m = *((LinearMeta **)task->local_args);
-  assert(regions.size() == (3 + static_cast<size_t>(m->use_bias)));
-  assert(task->regions.size() == (3 + static_cast<size_t>(m->use_bias)));
-
-  TensorAccessorR acc_input(
-      regions[0], task->regions[0], FID_DATA, ctx, runtime);
-  TensorAccessorW acc_output(regions[1],
-                             task->regions[1],
-                             FID_DATA,
-                             ctx,
-                             runtime,
-                             false /*readOutput*/);
-  TensorAccessorR acc_kernel(
-      regions[2], task->regions[2], FID_DATA, ctx, runtime);
-  int in_dim = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1;
-  int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1;
-  int batch_size = acc_output.rect.volume() / out_dim;
-  assert(acc_output.rect.volume() == static_cast<size_t>(out_dim * batch_size));
-  assert(acc_input.rect.volume() == static_cast<size_t>(in_dim * batch_size));
-  assert(acc_kernel.rect.volume() == static_cast<size_t>(in_dim * out_dim));
-  float const *acc_bias_ptr = NULL;
-  if (m->use_bias) {
-    TensorAccessorR acc_bias(
-        regions[3], task->regions[3], FID_DATA, ctx, runtime);
-    assert(acc_bias.rect.volume() == static_cast<size_t>(out_dim));
-    acc_bias_ptr = acc_bias.ptr;
+static void forward_task(Task const *task,
+                         std::vector<PhysicalRegion> const &regions,
+                         Context ctx,
+                         Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  forward_task_impl(acc);
+}
+
+static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
+  auto input = acc.get_tensor<Permissions::RO>(INPUT);
+  auto weight = acc.get_tensor<Permissions::RO>(WEIGHT);
+  auto output = acc.get_tensor<Permissions::RO>(OUTPUT);
+
+  auto input_grad = acc.get_tensor_grad<Permissions::RW>(INPUT);
+  auto weight_grad = acc.get_tensor_grad<Permissions::RW>(WEIGHT);
+  auto output_grad = acc.get_tensor_grad<Permissions::RW>(OUTPUT);
+  auto bias_grad = acc.get_tensor_grad<Permissions::RW>(BIAS);
+  auto per_device_state =
+      acc.get_argument<LinearPerDeviceState>(PER_DEVICE_STATE);
+  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  auto attrs = acc.get_argument<LinearAttrs>(ATTRS);
+
+  // backward_kernel accumulates into the bias *gradient* (it sits in the
+  // bias_grad_ptr slot of the kernel), so bind the grad accessor here rather
+  // than the bias weight itself
+  float *bias_grad_ptr = NULL;
+  if (attrs.use_bias) {
+    bias_grad_ptr = bias_grad.get_float_ptr();
+  }

-    forward_kernel_wrapper(m,
-                           acc_input.ptr,
-                           acc_output.ptr,
-                           acc_kernel.ptr,
-                           acc_bias_ptr,
-                           in_dim,
-                           out_dim,
-                           batch_size);
+  int in_dim = input.shape.at(ff_dim_t{0});
+  int out_dim = output.shape.at(ff_dim_t{0});
+  int batch_size = output.shape.get_volume() / out_dim;
+
+  return profile(backward_kernel,
+                 profiling,
+                 "[Linear] backward_time = %.2lfms\n",
+                 per_device_state,
+                 input.get_float_ptr(),
+                 input_grad.get_float_ptr(),
+                 output.get_float_ptr(),
+                 output_grad.get_float_ptr(),
+                 weight.get_float_ptr(),
+                 weight_grad.get_float_ptr(),
+                 bias_grad_ptr,
+                 in_dim,
+                 out_dim,
+                 batch_size);
 }

-void Linear::backward(FFModel const &ff) {
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
-  {
-    ArgumentMap argmap;
-    set_argumentmap_for_backward(ff, argmap);
-    IndexLauncher launcher(LINEAR_BWD_TASK_ID,
-                           parallel_is,
-                           TaskArgument(NULL, 0),
-                           argmap,
-                           Predicate::TRUE_PRED,
-                           false /*must*/,
-                           0 /*mapper_id*/,
-                           outputs[0]->machine_view.hash());
-    int rid = 0;
-    // regions[0](I): input
-    launcher.add_region_requirement(RegionRequirement(inputs[0]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      inputs[0]->region));
-    launcher.add_field(rid++, FID_DATA);
-    // regions[1](I/O): replica_grad
-    assert(replica == NULL);
-    if (trainableInputs[0]) {
-      launcher.add_region_requirement(
-          RegionRequirement(inputs[0]->part_grad,
-                            0 /*projection id*/,
-                            READ_WRITE,
-                            EXCLUSIVE,
-                            inputs[0]->region_grad));
-      launcher.add_field(rid++, FID_DATA);
-    }
-    // regions[2](I): output
-    launcher.add_region_requirement(RegionRequirement(outputs[0]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      outputs[0]->region));
-    launcher.add_field(rid++, FID_DATA);
-    // regions[3](I/O): output_grad
-    launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad,
-                                                      0 /*projection id*/,
-                                                      READ_WRITE,
-                                                      EXCLUSIVE,
-                                                      outputs[0]->region_grad));
-    launcher.add_field(rid++, FID_DATA);
-    // regions[4](I): filter
-    launcher.add_region_requirement(RegionRequirement(weights[0]->part,
-                                                      0 /*projection id*/,
-                                                      READ_ONLY,
-                                                      EXCLUSIVE,
-                                                      weights[0]->region));
-    launcher.add_field(rid++, FID_DATA);
-    // regions[5](I/O): filter_grad
-    launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad,
-                                                      0 /*projection id*/,
-                                                      READ_WRITE,
-                                                      EXCLUSIVE,
-                                                      weights[0]->region_grad));
-    launcher.add_field(rid++, FID_DATA);
-    if (use_bias) {
-      // regions[6](I/O): bias_grad
-      launcher.add_region_requirement(
-          RegionRequirement(weights[1]->part_grad,
-                            0 /*projection id*/,
-                            READ_WRITE,
-                            EXCLUSIVE,
-                            weights[1]->region_grad));
-      launcher.add_field(rid++, FID_DATA);
-    }
-    runtime->execute_index_space(ctx, launcher);
-  }
-  assert(replica == NULL);
 }

+static void backward_task(Task const *task,
+                          std::vector<PhysicalRegion> const &regions,
+                          Context ctx,
+                          Runtime *runtime) {
+  TaskArgumentAccessor acc(task, regions, ctx, runtime);
+  backward_task_impl(acc);
+}

-void Linear::backward_task(Task const *task,
-                           std::vector<PhysicalRegion> const &regions,
-                           Context ctx,
-                           Runtime *runtime) {
-  Domain in_domain = runtime->get_index_space_domain(
-      ctx, task->regions[0].region.get_index_space());
-  switch (in_domain.get_dim()) {
-#define DIMFUNC(DIM)                                                          \
-  case DIM:                                                                   \
-    return backward_task_with_dim<DIM>(task, regions, ctx, runtime);
-    LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-    default:
-      assert(false);
-  }
-}

+CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
+                                  LinearAttrs const &attrs,
+                                  InputParallelTensorDesc const &input,
+                                  ProfilingSettings const &settings,
+                                  MachineView const &machine_view) {
+  auto env = sim_factory.new_environment();

-/*
-  regions[0](I): input
-  regions[1](I/O): replica_grad or input_grad
-  regions[2](I): output
-  regions[3](I/O): output_grad
-  regions[4](I): filter
-  regions[5](I/O): filter_grad
-  regions[6](I/O): bias_grad
-*/
-template <int NDIM>
-void Linear::backward_task_with_dim(Task const *task,
-                                    std::vector<PhysicalRegion> const &regions,
-                                    Context ctx,
-                                    Runtime *runtime) {
-  // Linear* linear = (Linear*) task->args;
-  LinearMeta const *m = *((LinearMeta **)task->local_args);
-  assert(regions.size() == (5 + static_cast<size_t>(m->trainableInputs[0]) +
-                            static_cast<size_t>(m->use_bias)));
-  assert(task->regions.size() ==
-         (5 + static_cast<size_t>(m->trainableInputs[0]) +
-          static_cast<size_t>(m->use_bias)));
-  float *input_grad = NULL;
-  size_t rid = 0;
-  TensorAccessorR acc_input(
-      regions[rid], task->regions[rid], FID_DATA, ctx, runtime);
-  rid++;
-  if (m->trainableInputs[0]) {
-    Domain domain = runtime->get_index_space_domain(
-        ctx, task->regions[rid].region.get_index_space());
-    if (domain.get_dim() == NDIM + 1) {
-      assert(domain.get_volume() == acc_input.rect.volume());
-      input_grad = helperGetTensorPointerWO<float>(
-          regions[rid], task->regions[rid], FID_DATA, ctx, runtime);
-    } else {
-      TensorAccessorW acc_replica_grad(regions[rid],
-                                       task->regions[rid],
-                                       FID_DATA,
-                                       ctx,
-                                       runtime,
-                                       true /*readOutput*/);
-      assert(acc_replica_grad.rect.volume() == acc_input.rect.volume());
-      input_grad = acc_replica_grad.ptr;
-    }
-    rid++;
-  }
-  TensorAccessorR acc_output(
-      regions[rid], task->regions[rid], FID_DATA, ctx, runtime);
-  rid++;
-  TensorAccessorW acc_output_grad(regions[rid],
-                                  task->regions[rid],
-                                  FID_DATA,
-                                  ctx,
-                                  runtime,
-                                  true /*readOutput*/);
-  rid++;
-  TensorAccessorR acc_kernel(
-      regions[rid], task->regions[rid], FID_DATA, ctx, runtime);
-  rid++;
-  TensorAccessorW acc_kernel_grad(regions[rid],
-                                  task->regions[rid],
-                                  FID_DATA,
-                                  ctx,
-                                  runtime,
-                                  true /*readOutput*/);
-  rid++;
-  // make sure the sizes match
-  int in_dim = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1;
-  int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1;
-  int batch_size = acc_output.rect.volume() / out_dim;
-  assert(acc_output.rect.volume() == static_cast<size_t>(out_dim * batch_size));
-  assert(acc_output_grad.rect.volume() ==
-         static_cast<size_t>(out_dim * batch_size));
-  assert(acc_kernel.rect.volume() == static_cast<size_t>(in_dim * out_dim));
-  assert(acc_kernel_grad.rect.volume() ==
-         static_cast<size_t>(in_dim * out_dim));
-  float *acc_bias_grad_ptr = NULL;
-  if (m->use_bias) {
-    TensorAccessorW acc_bias_grad(regions[rid],
-                                  task->regions[rid],
-                                  FID_DATA,
-                                  ctx,
-                                  runtime,
-                                  true /*readOutput*/);
-    rid++;
-    assert(acc_bias_grad.rect.volume() == static_cast<size_t>(out_dim));
-    acc_bias_grad_ptr = static_cast<float *>(acc_bias_grad.ptr);
-  }
-  assert(rid == regions.size());
+  ParallelTensorShape output_shape = get_output_shape(input.shape, attrs);

-  backward_kernel_wrapper(m,
-                          acc_input.ptr,
-                          input_grad,
-                          acc_output.ptr,
-                          acc_output_grad.ptr,
-                          acc_kernel.ptr,
-                          acc_kernel_grad.ptr,
-                          acc_bias_grad_ptr,
-                          in_dim,
-                          out_dim,
-                          batch_size);
-}
+  SimTaskBinding init_binding;
+  init_binding.bind(INPUT, input_tensor(0));
+  init_binding.bind(WEIGHT, weight_tensor(0));
+  init_binding.bind(BIAS, bias_tensor(0));
+  init_binding.bind(OUTPUT, output_tensor(0));
+  init_binding.bind_arg(ATTRS, attrs);
+  init_binding.bind_arg(HANDLE, ff_handle());

-void Linear::print_layer(FFModel const &ff) {
-  printf("linear layer\n");
-  Context ctx = ff.config.lg_ctx;
-  Runtime *runtime = ff.config.lg_hlr;
+  auto init_accessor = env.get_init_accessor(LINEAR_INIT_TASK_ID, init_binding);

-  RegionRequirement kernel_req(
-      weights[0]->region, READ_WRITE, EXCLUSIVE, weights[0]->region);
-  kernel_req.add_field(FID_DATA);
-  InlineLauncher kernel_launcher(kernel_req);
-  PhysicalRegion kernel_region = runtime->map_region(ctx, kernel_launcher);
-  kernel_region.wait_until_valid();
+  DeviceSpecific<LinearPerDeviceState> per_device_state =
+      init_task_impl(init_accessor);

-  RegionRequirement bias_req(
-      weights[1]->region, READ_WRITE, EXCLUSIVE, weights[1]->region);
-  bias_req.add_field(FID_DATA);
-  InlineLauncher bias_launcher(bias_req);
-  PhysicalRegion bias_region = runtime->map_region(ctx, bias_launcher);
-  bias_region.wait_until_valid();
+  SimTaskBinding fwd_binding;

-  TensorAccessorW acc_kernel(
-      kernel_region, kernel_req, FID_DATA, ctx, runtime, true);
-  TensorAccessorW acc_bias(
-      bias_region, bias_req, FID_DATA, ctx, runtime, true);
+  fwd_binding.bind(INPUT, input_tensor(0));
+  fwd_binding.bind(WEIGHT, weight_tensor(0));
+  fwd_binding.bind(OUTPUT, output_tensor(0));
+  fwd_binding.bind(BIAS, bias_tensor(0));

-  float const *kernel_ptr = acc_kernel.ptr;
-  float const *bias_ptr = acc_bias.ptr;
+  fwd_binding.bind_arg(PROFILING, profiling_settings());
+  fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state);
+  fwd_binding.bind_arg(ATTRS, attrs);

-  size_t kernel_size = acc_kernel.rect.volume();
-  int kernel_dim1 = acc_kernel.rect.hi[0] - acc_kernel.rect.lo[0] + 1;
-  int kernel_dim2 = acc_kernel.rect.hi[1] - acc_kernel.rect.lo[1] + 1;
-  size_t bias_size = acc_bias.rect.volume();
-  printf("kernel, %p, %zu, [%d, %d]\n",
-         kernel_ptr,
-         kernel_size,
-         kernel_dim1,
-         kernel_dim2);
-  printf("bias, %p, %zu\n", bias_ptr, bias_size);
+  SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding);

-  for (size_t i = 0; i < bias_size; i++) {
-    printf("%f ", bias_ptr[i]);
-  }
-  printf("\n");
+  auto fwd_accessor = env.get_accessor(LINEAR_FWD_TASK_ID, fwd_binding);
+  auto bwd_accessor = env.get_accessor(LINEAR_BWD_TASK_ID, bwd_binding);

-  for (size_t i = 0; i < kernel_size; i++) {
-    printf("%f ", kernel_ptr[i]);
-  }
-  printf("\n");
+  float forward_time = forward_task_impl(fwd_accessor).value();
+  float backward_time = backward_task_impl(bwd_accessor).value();

-  runtime->unmap_region(ctx, kernel_region);
-  runtime->unmap_region(ctx, bias_region);
+  float sync_time = default_estimate_sync_time(env);
+  return make_metrics(forward_time, backward_time, sync_time, env);
 }

-bool Linear::estimate_sync_cost(Simulator *sim,
-                                MachineView const &view,
-                                CostMetrics &cost_metrics) const {
-  // Estimate the cost of sync weights
-  ParallelTensorShape tensor_shape;
-  tensor_shape.num_dims = 3;
-  tensor_shape.data_type = inputs[0]->data_type;
-  tensor_shape.dims[0] = inputs[0]->dims[0];
-  tensor_shape.dims[1] = inputs[0]->dims[inputs[0]->num_dims - 1];
-  tensor_shape.dims[2] = inputs[0]->dims[inputs[0]->num_dims - 2];
-  tensor_shape.dims[1].size = out_channels;
-  tensor_shape.dims[1].degree = 1;
-  tensor_shape.dims[2].degree =
-      inputs[0]->dims[1].degree * inputs[0]->dims[2].degree;
-  tensor_shape.dims[2].size =
-      inputs[0]->dims[1].degree * inputs[0]->dims[2].degree;
-  cost_metrics.sync_time =
-      sim->default_estimate_sync_cost(tensor_shape, view, 1);
-  // printf("[Estimate Linear] name(%s) sync_time(%.4lf)\n", name,
-  //        cost_metrics.sync_time);
-  return true;
-}
+template <>
+OpTaskSignature init_signature<LINEAR_INIT_TASK_ID>() {
+  OpTaskSignature init(OpTaskType::INIT);

-ParallelConfig Linear::get_random_parallel_config(FFModel const &ff) const {
-  if (!ff.config.enable_parameter_parallel) {
-    return Op::get_random_parallel_config(ff);
-  }
-  std::vector<int> batch_candidates;
-  std::vector<int> channel_candidates;
-  int batch = outputs[0]->dims[outputs[0]->num_dims - 1].size;
-  int channel = outputs[0]->dims[0].size;
-  int total_devices = ff.config.workersPerNode * ff.config.numNodes;
-  for (int i = 1; i <= ff.config.workersPerNode; i++) {
-    if (channel % i == 0) {
-      for (int j = 1; i * j <= total_devices; j++) {
-        if (batch % j == 0) {
-          batch_candidates.push_back(j);
-          channel_candidates.push_back(i);
-        }
-      }
-    }
-  }
-  assert(batch_candidates.size() > 0);
-  int idx = std::rand() % batch_candidates.size();
-  int num_par_c = channel_candidates[idx];
-  int num_par_b = batch_candidates[idx];
-  ParallelConfig pc;
-  pc.device_type = ParallelConfig::GPU;
-  pc.nDims = outputs[0]->num_dims;
-  pc.dim[0] = num_par_c;
-  pc.dim[pc.nDims - 1] = num_par_b;
-  for (int i = 1; i < pc.nDims - 1; i++) {
-    pc.dim[i] = 1;
-  }
-  int start_idx = std::rand() % (total_devices - num_par_c * num_par_b + 1);
-  start_idx = start_idx - start_idx % num_par_c;
-  for (int i = 0; i < num_par_c * num_par_b; i++) {
-    pc.device_ids[i] = start_idx + i;
-  }
-  return pc;
-}
+  init.add_input_slot(INPUT);
+  init.add_input_slot(WEIGHT);
+  init.add_input_slot(BIAS);
+  init.add_output_slot(OUTPUT);

-bool Linear::get_int_parameter(PMParameter para, int *value) const {
-  switch (para) {
-    case PM_ACTI:
-      *value = (int)activation;
-      return true;
-    default:
-      return Op::get_int_parameter(para, value);
-  }
-}
+  init.add_arg_slot<LinearAttrs>(ATTRS);
+  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);

-bool Linear::is_valid_parallel_config(FFModel const &ff,
-                                      ParallelConfig const &pc) const {
-  if (!ff.config.enable_parameter_parallel) {
-    return Op::is_valid_parallel_config(ff, pc);
-  }
-  // Support data and parameter parallel
-  if (pc.nDims != outputs[0]->num_dims) {
-    return false;
-  }
-  for (int i = 1; i < pc.nDims - 1; i++) {
-    if (pc.dim[i] != 1) {
-      return false;
-    }
-  }
-  return true;
+  init.add_return_value<LinearPerDeviceState>();
+  return init;
 }

-bool Linear::measure_operator_cost(Simulator *sim,
-                                   MachineView const &mv,
-                                   CostMetrics &cost_metrics) const {
-  ParallelTensorBase sub_output, sub_input;
-  if (!outputs[0]->get_sub_tensor(mv, sub_output)) {
-    return false;
-  }
-  if (!inputs[0]->get_sub_tensor(mv, sub_input)) {
-    return false;
-  }
-  int input_c = sub_input.dims[0].size;
-  int input_n = sub_input.get_volume() / input_c;
-  int output_c = sub_output.dims[0].size;
-  int output_n = sub_output.get_volume() / output_c;
-  LinearMeta *m = sim->linear_meta;
-  m->activation = activation;
-  m->input_type = inputs[0]->data_type;
-  m->weight_type = this->data_type;
-  m->output_type = outputs[0]->data_type;
-  assert(m->profiling == false);
-
-  init_kernel(m, output_n, output_c);
-
-  // allocate tensors in simulator
-  sim->free_all();
-  void *input_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type);
-  cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-
-  void *output_ptr =
-      sim->allocate(sub_output.get_volume(), outputs[0]->data_type);
-  cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-
-  void *kernel_ptr = sim->allocate((size_t)output_c * input_c, this->data_type);
-  void *bias_ptr = sim->allocate(output_c, this->data_type);
-  assert(bias_ptr != NULL);
-  cost_metrics.weights_memory += cost_metrics.total_mem_diff_from(sim->offset);
-
-  bool out_of_memory = (input_ptr == NULL) || (output_ptr == NULL) ||
-                       (kernel_ptr == NULL) || (bias_ptr == NULL);
-  if (out_of_memory) {
-    cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME;
-    cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME;
-    return true;
-  }
-  std::function<void()> forward, backward;
-  forward = [&] {
-    forward_kernel_wrapper(m,
-                           input_ptr,
-                           output_ptr,
-                           kernel_ptr,
-                           bias_ptr,
-                           input_c,
-                           output_c,
-                           input_n);
-  };
-  if (sim->computationMode == COMP_MODE_TRAINING) {
-    void *input_grad_ptr = NULL;
-    if (trainableInputs[0]) {
-      input_grad_ptr =
-          sim->allocate(sub_input.get_volume(), inputs[0]->data_type);
-    } else {
-      input_grad_ptr =
-          sim->allocate(sub_input.get_volume(), inputs[0]->data_type);
-    }
-    cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);
-
-    void *output_grad_ptr =
-        sim->allocate(sub_output.get_volume(), outputs[0]->data_type);
-    cost_metrics.outputs_memory +=
-        cost_metrics.total_mem_diff_from(sim->offset);
-
-    void *kernel_grad_ptr =
-        sim->allocate((size_t)output_c * input_c, this->data_type);
-    void *bias_grad_ptr = sim->allocate(output_c, this->data_type);
-    cost_metrics.weights_memory +=
-        cost_metrics.total_mem_diff_from(sim->offset);
+template <>
+OpTaskSignature fwd_signature<LINEAR_FWD_TASK_ID>() {
+  OpTaskSignature fwd(OpTaskType::FWD);

-    out_of_memory = (input_grad_ptr == NULL) || (output_grad_ptr == NULL) ||
-                    (kernel_grad_ptr == NULL) || (bias_grad_ptr == NULL);
-    if (out_of_memory) {
-      cost_metrics.forward_time = Simulator::MAXIMUM_TASK_RUN_TIME;
-      cost_metrics.backward_time = Simulator::MAXIMUM_TASK_RUN_TIME;
-      return true;
-    }
-    backward = [&] {
-      backward_kernel_wrapper(m,
-                              input_ptr,
-                              input_grad_ptr,
-                              output_ptr,
-                              output_grad_ptr,
-                              kernel_ptr,
-                              kernel_grad_ptr,
-                              bias_grad_ptr,
-                              input_c,
-                              output_c,
-                              input_n);
-    };
-  }
-
-  inner_measure_operator_cost(sim, forward, backward, cost_metrics);
+  fwd.add_input_slot(INPUT);
+  fwd.add_input_slot(WEIGHT);
+  fwd.add_input_slot(BIAS);
+  fwd.add_output_slot(OUTPUT);

-  if (sim->computationMode == COMP_MODE_TRAINING) {
-    log_measure.debug("[Measure Linear] name(%s) in(%d %d) out(%d %d) "
-                      "forward_time(%.4lf) backward_time(%.4lf)\n",
-                      name,
-                      input_n,
-                      input_c,
-                      output_n,
-                      output_c,
-                      cost_metrics.forward_time,
-                      cost_metrics.backward_time);
-  } else {
-    log_measure.debug(
-        "[Measure Linear] name(%s) in(%d %d) out(%d %d) forward_time(%.4lf)\n",
-        name,
-        input_n,
-        input_c,
-        output_n,
-        output_c,
-        cost_metrics.forward_time);
-  }
-  return true;
+  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<LinearAttrs>(ATTRS);
+  fwd.add_unchecked_arg_slot<LinearPerDeviceState>(PER_DEVICE_STATE);
+  return fwd;
 }

-void Linear::serialize(Legion::Serializer &sez) const {
-  sez.serialize(this->layer_guid.id);
-  sez.serialize(this->out_channels);
-  sez.serialize(this->activation);
-  sez.serialize(this->use_bias);
-  sez.serialize(this->data_type);
+template <>
+OpTaskSignature bwd_signature<LINEAR_BWD_TASK_ID>() {
+  OpTaskSignature bwd =
+      infer_bwd_signature(fwd_signature<LINEAR_FWD_TASK_ID>());
+  return bwd;
 }

-/* static */
-using PCG::Node;
-Node Linear::deserialize(FFModel &ff,
-                         Legion::Deserializer &dez,
-                         ParallelTensor inputs[],
-                         int num_inputs) {
-  assert(num_inputs == 1);
-  int out_channels;
-  ActiMode activation;
-  bool use_bias;
-  DataType data_type;
-  size_t id;
-  dez.deserialize(id);
-  LayerID layer_guid(id);
-  dez.deserialize(out_channels);
-  dez.deserialize(activation);
-  dez.deserialize(use_bias);
-  dez.deserialize(data_type);
+template <>
+void register_task<LINEAR_INIT_TASK_ID>() {

-  LinearParams params;
-  params.activation = activation;
-  params.out_channels = out_channels;
-  params.use_bias = use_bias;
-  params.data_type = data_type;
-  params.layer_guid = layer_guid;
-  return ff.get_or_create_node<Linear>(inputs[0], params);
+  register_task(LINEAR_INIT_TASK_ID,
+                "Linear::init_task",
+                init_signature<LINEAR_INIT_TASK_ID>(),
+                init_task);
 }

-LinearParams Linear::get_params() const {
-  LinearParams params;
-  params.layer_guid = this->layer_guid;
-  params.out_channels = this->out_channels;
-  params.use_bias = this->use_bias;
-  params.data_type = this->data_type;
-  params.activation = this->activation;
-
-  return params;
+template <>
+void register_task<LINEAR_FWD_TASK_ID>() {
+  register_task(LINEAR_FWD_TASK_ID,
+                "Linear::fwd_task",
+                fwd_signature<LINEAR_FWD_TASK_ID>(),
+                forward_task);
 }

-/* void LinearParams::solve_dims(const ParallelTensor input, */
-/*                               ParallelDim output_dims[MAX_TENSOR_DIM], */
-/*                               int *output_ndims, */
-/*                               ParallelDim kernel_dims[MAX_TENSOR_DIM], */
-/*                               int *kernel_ndims, */
-/*                               ParallelDim bias_dims[MAX_TENSOR_DIM], */
-/*                               int *bias_ndims) const { */
-/*   this->solve_dims(input->get_shape(), */
-/*                    output_dims, */
-/*                    output_ndims, */
-/*                    kernel_dims, */
-/*                    kernel_ndims, */
-/*                    bias_dims, */
-/*                    bias_ndims); */
-/* } */
-
-/* void LinearParams::solve_dims(ParallelTensorShape const &input_shape, */
-/*                               ParallelTensorShape &output_shape, */
-/*                               ParallelTensorShape &kernel_shape, */
-/*                               ParallelTensorShape &bias_shape) const { */
-/*   this->solve_dims(input_shape, */
-/*                    output_shape.dims, */
-/*                    &output_shape.num_dims, */
-/*                    kernel_shape.dims, */
-/*                    &kernel_shape.num_dims, */
-/*                    bias_shape.dims, */
-/*                    &bias_shape.num_dims); */
-/* } */
-
-/* void LinearParams::solve_dims(ParallelTensorShape const &input_shape, */
-/*                               ParallelDim output_dims[MAX_TENSOR_DIM], */
-/*                               int *output_ndims, */
-/*                               ParallelDim kernel_dims[MAX_TENSOR_DIM], */
-/*                               int *kernel_ndims, */
-/*                               ParallelDim bias_dims[MAX_TENSOR_DIM], */
-/*                               int *bias_ndims) const { */
-/*   assert((output_dims == nullptr) == (output_ndims == nullptr)); */
-/*   assert((kernel_dims == nullptr) == (kernel_ndims == nullptr)); */
-/*   assert((bias_dims == nullptr) == (bias_ndims == nullptr)); */
-
-/*   std::vector mapping; */
-/*   this->construct_mappings(mapping, input_shape); */
-/*   this->mark_replica_dims(input_shape, output_dims, kernel_dims, bias_dims);
- */
-
-/*   solve_parallel_dim_mappings( */
-/*       mapping, {input_shape.dims}, {kernel_dims, bias_dims}, {output_dims});
- */
-
-/*   this->calculate_nonreplica_dim_sizes(input_shape, */
-/*                                        output_dims, */
-/*                                        output_ndims, */
-/*                                        kernel_dims, */
-/*                                        kernel_ndims, */
-/*                                        bias_dims, */
-/*                                        bias_ndims); */
-/* } */
-
-/* std::unordered_map */
-/*     LinearParams::get_dimension_names( */
-/*         ParallelTensorShape const &input_shape) const { */
-/*   int num_dims = input_shape.num_dims; */
-
-/*   return {{INPUT_CHANNEL, 0}, */
-/*           {INPUT_SAMPLE, num_dims - 2}, */
-/*           {INPUT_REPLICA, num_dims - 1}, */
-/*           {OUTPUT_CHANNEL, 0}, */
-/*           {OUTPUT_SAMPLE, num_dims - 2}, */
-/*           {OUTPUT_REPLICA, num_dims - 1}, */
-/*           {KERNEL_CHANNEL_IN, 0}, */
-/*           {KERNEL_CHANNEL_OUT, 1}, */
-/*           {BIAS_CHANNEL_OUT, 0}}; */
-/* } */
-
-/* void LinearParams::calculate_nonreplica_dim_sizes( */
-/*     ParallelTensorShape const &input_shape, */
-/*     ParallelDim output_dims[MAX_TENSOR_DIM], */
-/*     int *output_ndims, */
-/*     ParallelDim kernel_dims[MAX_TENSOR_DIM], */
-/*     int *kernel_ndims, */
-/*     ParallelDim bias_dims[MAX_TENSOR_DIM], */
-/*     int *bias_ndims) const { */
-/*   auto dimension_names = this->get_dimension_names(input_shape); */
-/*   int num_dims = input_shape.num_dims; */
-
-/*   if (output_dims != nullptr) { */
-/*     for (int i = 1; i < input_shape.num_dims - 1; i++) { */
-/*       output_dims[i].size = input_shape.dims[i].size; */
-/*     } */
-/*     output_dims[dimension_names.at(OUTPUT_CHANNEL)].size =
- * this->out_channels; */
-/*     *output_ndims = num_dims; */
-/*   } */
-/*   if (kernel_dims != nullptr) { */
-/*     kernel_dims[dimension_names.at(KERNEL_CHANNEL_IN)].size = */
-/*         input_shape.dims[INPUT_CHANNEL].size / */
-/*         input_shape.dims[INPUT_CHANNEL].degree; */
-/*     kernel_dims[dimension_names.at(KERNEL_CHANNEL_OUT)].size = */
-/*         this->out_channels; */
-/*     *kernel_ndims = num_dims; */
-/*   } */
-/*   if (bias_dims != nullptr) { */
-/*     bias_dims[dimension_names.at(BIAS_CHANNEL_OUT)].size =
- * this->out_channels; */
-/*     *bias_ndims = num_dims; */
-/*   } */
-/* } */
-
-/* void LinearParams::mark_replica_dims( */
-/*     ParallelTensorShape const &input_shape, */
-/*     ParallelDim output_dims[MAX_TENSOR_DIM], */
-/*     ParallelDim kernel_dims[MAX_TENSOR_DIM], */
-/*     ParallelDim bias_dims[MAX_TENSOR_DIM]) const { */
-/*   int num_dims = input_shape.num_dims; */
-/*   auto dimension_names = this->get_dimension_names(input_shape); */
-/*   if (output_dims != nullptr) { */
-/*     output_dims[dimension_names.at(OUTPUT_REPLICA)].is_replica_dim = true; */
-/*   } */
-/*   if (kernel_dims != nullptr) { */
-/*     for (int i = 2; i < num_dims; i++) { */
-/*       kernel_dims[i].is_replica_dim = true; */
-/*     } */
-/*   } */
-/*   if (bias_dims != nullptr) { */
-/*     for (int i = 1; i < num_dims; i++) { */
-/*       bias_dims[i].is_replica_dim = true; */
-/*     } */
-/*   } */
-/* } */
+template <>
+void register_task<LINEAR_BWD_TASK_ID>() {
+  register_task(LINEAR_BWD_TASK_ID,
+                "Linear::bwd_task",
+                bwd_signature<LINEAR_BWD_TASK_ID>(),
+                backward_task);
+}

 }; // namespace FlexFlow
-
-namespace std {
-size_t hash<FlexFlow::LinearParams>::operator()(
-    FlexFlow::LinearParams const &params) const {
-  size_t key = 0;
-  hash_combine(key, params.layer_guid.id);
-  hash_combine(key, params.out_channels);
-  hash_combine(key, params.use_bias);
-  hash_combine(key, params.data_type);
-  hash_combine(key, params.activation);
-  return key;
-}
-}; // namespace std
diff --git a/lib/runtime/src/ops/linear.h b/lib/runtime/src/ops/linear.h
index 55f719ba77..2b476382ef 100644
--- a/lib/runtime/src/ops/linear.h
+++ b/lib/runtime/src/ops/linear.h
@@ -20,7 +20,7 @@ OpTaskInvocation backward(LinearAttrs const &);

 CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory,
                                   LinearAttrs const &attrs,
-                                  ParallelTensorShape const &input_shape,
+                                  InputParallelTensorDesc const &input,
                                   ProfilingSettings const &settings,
                                   MachineView const &machine_view);