From 72f22dd32fa0ae7679d3abfe501a91c0c9ee001c Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Sun, 21 Apr 2024 14:22:41 -0400 Subject: [PATCH 1/4] add optimizer, partition, pool --- lib/kernels/src/hip/optimizer_kernel.cpp | 9 +- lib/kernels/src/hip/partition_kernels.cpp | 24 ++-- lib/kernels/src/hip/pool_2d_kernels.cpp | 136 +++++++++++----------- 3 files changed, 87 insertions(+), 82 deletions(-) diff --git a/lib/kernels/src/hip/optimizer_kernel.cpp b/lib/kernels/src/hip/optimizer_kernel.cpp index c22ecd7f5a..164f9acd89 100644 --- a/lib/kernels/src/hip/optimizer_kernel.cpp +++ b/lib/kernels/src/hip/optimizer_kernel.cpp @@ -13,16 +13,11 @@ * limitations under the License. */ -#include "flexflow/accessor.h" -#include "flexflow/model.h" -#include "flexflow/optimizer.h" -#include "utils/hip_helper.h" +#include "kernels/optimizer_kernels.h" #include namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); - __global__ void sgd_update(size_t count, float lr, float weight_decay, @@ -87,6 +82,7 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, #ifdef FF_USE_NCCL __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, + PerDeviceOpState const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, @@ -208,6 +204,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, #ifdef FF_USE_NCCL __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, + PerDeviceOpState const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, diff --git a/lib/kernels/src/hip/partition_kernels.cpp b/lib/kernels/src/hip/partition_kernels.cpp index 3761da5c84..4591247faa 100644 --- a/lib/kernels/src/hip/partition_kernels.cpp +++ b/lib/kernels/src/hip/partition_kernels.cpp @@ -14,21 +14,17 @@ */ #include "kernels/partition_kernels.h" +#include "device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/hip_helper.h" #include namespace FlexFlow { - -RepartitionPerDeviceState::RepartitionPerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) {} - namespace Kernels { namespace Repartition { tempate struct ForwardKernel { void operator()(hipStream_t stream, - RepartitionPerDeviceState const *m, + RepartitionPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { checkCUDA(hipMemcpyAsync(output.get(), @@ -41,7 +37,7 @@ tempate struct ForwardKernel { tempate struct BackwardKernel { void operator()(hipStream_t stream, - RepartitionPerDeviceState const *m, + RepartitionPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { hipLaunchKernelGGL(HIP_KERNEL_NAME(add_kernel), @@ -55,19 +51,25 @@ tempate struct BackwardKernel { } } +RepartitionPerDeviceState + init_kernel(PerDeviceFFHandle const &handle, DataType data_type) { + RepartitionPerDeviceState per_device_state = {handle, data_type}; + return per_device_state; +} + void forward_kernel(hipStream_t stream, - RepartitionPerDeviceState const *m, + RepartitionPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(m->data_type, stream, m, input, output) + DataTypeDispatch1{}(m.data_type, stream, m, input, output) } void backward_kernel(hipStream_t stream, - RepartitionPerDeviceState const *m, + RepartitionPerDeviceState const &m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}( - m->data_type, stream, m, input_grad, output_grad) + m.data_type, stream, m, input_grad, output_grad) } } // namespace Repartition diff --git a/lib/kernels/src/hip/pool_2d_kernels.cpp b/lib/kernels/src/hip/pool_2d_kernels.cpp index 0bb44c3e1a..ee343802cc 100644 --- a/lib/kernels/src/hip/pool_2d_kernels.cpp +++ b/lib/kernels/src/hip/pool_2d_kernels.cpp @@ -14,116 +14,122 @@ */ #include "kernels/pool_2d_kernels.h" -#include "kernels/hip_helper.h" +#include "device.h" +#include namespace FlexFlow { -Pool2DPerDeviceState::Pool2DPerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) { +namespace Kernels { +namespace Pool2D { + +Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, + optional activation, + int input_w, + int input_h, + int input_c, + int input_n, + int output_w, + int output_h, + int output_c, + int output_n, + int pad_h, + int pad_w, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + PoolOp pool_type) { + ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t outputTensor; + ffPoolingDescriptor_t poolDesc; + ffActivationDescriptor_t actiDesc; + checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreatePoolingDescriptor(&poolDesc)); -} + checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); -namespace Kernels { -namespace Pool2D { - -void init_kernel(Pool2DPerDeviceState *m, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int pad_h, - int pad_w, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - PoolType pool_type) { checkCUDNN(miopenSet4dTensorDescriptor( - m->inputTensor, miopenFloat, input_n, input_c, input_h, input_w)); - - miopenPoolingMode_t mode; - if (pool_type == POOL_MAX) { - mode = miopenPoolingMax; + inputTensor, miopenFloat, input_n, input_c, input_h, input_w)); + cudnnPoolingMode_t mode; + if (pool_type == PoolOp::MAX) { + mode = MIOPEN_POOLING_MAX; } else { - assert(pool_type == POOL_AVG); - mode = miopenPoolingAverage; + assert(pool_type == PoolOp::AVG); + mode = MIOPEN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; } - checkCUDNN(miopenSet2dPoolingDescriptor( - m->poolDesc, mode, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w)); + + checkCUDNN(miopenSetPooling2dDescriptor( + poolDesc, mode, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w)); + int n, c, h, w; - checkCUDNN(miopenGetPoolingForwardOutputDim( - m->poolDesc, m->inputTensor, &n, &c, &h, &w)); + checkCUDNN(miopenGetPooling2dForwardOutputDim( + poolDesc, inputTensor, &n, &c, &h, &w)); assert(n == output_n); assert(c == output_c); assert(h == output_h); assert(w == output_w); checkCUDNN( - miopenSet4dTensorDescriptor(m->outputTensor, miopenFloat, n, c, h, w)); + miopenSet4dTensorDescriptor(outputTensor, miopenFloat, n, c, h, w)); + bool relu = false; + if (activation == Activation::RELU) { + relu = true; + } + Pool2DPerDeviceState state = { + handle, + inputTensor, + outputTensor, + actiDesc, + poolDesc, + relu, + }; + return state; } void forward_kernel(hipStream_t stream, - Pool2DPerDeviceState const *m, + Pool2DPerDeviceState const &m, void const *input_ptr, void *output_ptr) { - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + checkCUDNN(miopenSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; - checkCUDNN(miopenPoolingForward(m->handle.dnn, - m->poolDesc, + checkCUDNN(miopenPoolingForward(m.handle.dnn, + m.poolDesc, &alpha, - m->inputTensor, + m.inputTensor, input_ptr, &beta, - m->outputTensor, + m.outputTensor, output_ptr, true, - m->handle.workSpace, - m->handle.workSpaceSize)); - if (m->profiling) { - hipEventRecord(t_end, stream); - checkCUDA(hipEventSynchronize(t_end)); - // print_tensor<4, float>(acc_input.ptr, acc_input.rect, - // "[Pool2D:forward:input]"); print_tensor<4, float>(acc_output.ptr, - // acc_output.rect, "[Pool2D:forward:output]"); - float elapsed = 0; - checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); - hipEventDestroy(t_start); - hipEventDestroy(t_end); - printf("%s [Pool2D] forward time = %.2fms\n", m->op_name, elapsed); - } + m.handle.workSpace, + m.handle.workSpaceSize)); } void backward_kernel(hipStream_t stream, - Pool2DPerDeviceState const *m, + Pool2DPerDeviceState const &m, void const *input_ptr, void *input_grad_ptr, void const *output_ptr, void const *output_grad_ptr) { - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + checkCUDNN(miopenSetStream(m.handle.dnn, stream)); float alpha = 1.0f; - float beta = 0.0f; - checkCUDNN(miopenPoolingBackward(m->handle.dnn, - m->poolDesc, + checkCUDNN(miopenPoolingBackward(m.handle.dnn, + m.poolDesc, &alpha, - m->outputTensor, + m.outputTensor, output_ptr, - m->outputTensor, + m.outputTensor, output_grad_ptr, - m->inputTensor, + m.inputTensor, input_ptr, &beta, - m->inputTensor, + m.inputTensor, input_grad_ptr, - m->handle.workSpace)); + m.handle.workSpace)); } } // namespace Pool2D From 968964dade6e422373880e39b69b0b017c70c307 Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Tue, 21 May 2024 16:42:23 -0400 Subject: [PATCH 2/4] fix mode typo --- lib/kernels/src/hip/pool_2d_kernels.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kernels/src/hip/pool_2d_kernels.cpp b/lib/kernels/src/hip/pool_2d_kernels.cpp index ee343802cc..5ab3512112 100644 --- a/lib/kernels/src/hip/pool_2d_kernels.cpp +++ b/lib/kernels/src/hip/pool_2d_kernels.cpp @@ -51,12 +51,12 @@ Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, checkCUDNN(miopenSet4dTensorDescriptor( inputTensor, miopenFloat, input_n, input_c, input_h, input_w)); - cudnnPoolingMode_t mode; + miopenPoolingMode_t mode; if (pool_type == PoolOp::MAX) { mode = MIOPEN_POOLING_MAX; } else { assert(pool_type == PoolOp::AVG); - mode = MIOPEN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + mode = MIOPEN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; } checkCUDNN(miopenSetPooling2dDescriptor( From f25e728ae21d7b9bda44e317be1fb8297df141cb Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Tue, 21 May 2024 16:50:41 -0400 Subject: [PATCH 3/4] reverted to the original optimizer in repo-refactor branch --- lib/kernels/src/hip/optimizer_kernel.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/kernels/src/hip/optimizer_kernel.cpp b/lib/kernels/src/hip/optimizer_kernel.cpp index 164f9acd89..c22ecd7f5a 100644 --- a/lib/kernels/src/hip/optimizer_kernel.cpp +++ b/lib/kernels/src/hip/optimizer_kernel.cpp @@ -13,11 +13,16 @@ * limitations under the License. */ -#include "kernels/optimizer_kernels.h" +#include "flexflow/accessor.h" +#include "flexflow/model.h" +#include "flexflow/optimizer.h" +#include "utils/hip_helper.h" #include namespace FlexFlow { +LegionRuntime::Logger::Category log_optimizer("optimizer"); + __global__ void sgd_update(size_t count, float lr, float weight_decay, @@ -82,7 +87,6 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, #ifdef FF_USE_NCCL __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, - PerDeviceOpState const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, @@ -204,7 +208,6 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, #ifdef FF_USE_NCCL __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, - PerDeviceOpState const *meta, float const *w_grad_ptr, size_t size, float *w_ptr, From 288f3829600b7caa1f032a92716ffbe8e72d9161 Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Fri, 24 May 2024 16:57:14 -0400 Subject: [PATCH 4/4] fix pooling mode --- lib/kernels/src/hip/pool_2d_kernels.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kernels/src/hip/pool_2d_kernels.cpp b/lib/kernels/src/hip/pool_2d_kernels.cpp index 5ab3512112..ed942c105c 100644 --- a/lib/kernels/src/hip/pool_2d_kernels.cpp +++ b/lib/kernels/src/hip/pool_2d_kernels.cpp @@ -53,10 +53,10 @@ Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, inputTensor, miopenFloat, input_n, input_c, input_h, input_w)); miopenPoolingMode_t mode; if (pool_type == PoolOp::MAX) { - mode = MIOPEN_POOLING_MAX; + mode = miopenPoolingMax; } else { assert(pool_type == PoolOp::AVG); - mode = MIOPEN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + mode = miopenPoolingAverage; } checkCUDNN(miopenSetPooling2dDescriptor(