From 778ffeeecba4ed7cf5ef6114507d5d6b89a31a9a Mon Sep 17 00:00:00 2001
From: ZiyueHuang
Date: Wed, 28 Mar 2018 05:04:22 +0000
Subject: [PATCH 1/2] gpu kernels

---
 src/operator/optimizer_op-inl.h | 127 ++++++++++++++++++++++----------
 src/operator/optimizer_op.cc    |  34 ++++++++-
 src/operator/optimizer_op.cu    |  32 +++++++-
 3 files changed, 151 insertions(+), 42 deletions(-)

diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h
index 104f20a61eeb..d56b9806b735 100644
--- a/src/operator/optimizer_op-inl.h
+++ b/src/operator/optimizer_op-inl.h
@@ -106,10 +106,41 @@ inline void SGDUpdate(const nnvm::NodeAttrs& attrs,
   });
 }
 
+/*! \brief kernel for sparse sgd
+ */
+template<int req, typename xpu>
+struct SGDDnsRspKernel;
+
+template<int req>
+struct SGDDnsRspKernel<req, gpu> {
+  // DType is the output data type
+  // IType is row sparse idx type
+  // i is the ith element in row sparse gradient
+  template<typename DType, typename IType>
+  MSHADOW_XINLINE static void Map(int i, const index_t row_length, DType* out, const DType* weight,
+                                  const IType* grad_idx, const DType *grad_val,
+                                  const DType clip_gradient, const DType lr,
+                                  const DType wd, const DType rescale_grad) {
+    using nnvm::dim_t;
+    using namespace mshadow_op;
+    const dim_t row_id = i / row_length;
+    const dim_t col_id = i % row_length;
+    const dim_t row_offset = grad_idx[row_id] * row_length;
+    const dim_t data_i = row_offset + col_id;
+    if (clip_gradient >= 0.0f) {
+      KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] -
+                    (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[i], clip_gradient));
+    } else {
+      KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] -
+                    (lr * rescale_grad) * grad_val[i]);
+    }
+  }
+};
+
 /*! \brief kernel for sparse sgd
  */
 template<int req>
-struct SGDDnsRspKernel {
+struct SGDDnsRspKernel<req, cpu> {
   // DType is the output data type
   // IType is row sparse idx type
   // i is the ith row in row sparse gradient
@@ -154,11 +185,15 @@ inline void SGDUpdateDnsRspImpl(const SGDParam& param,
   MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, {
     MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
       DType* weight_data = weight.dptr<DType>();
-      IType* grad_idx = grad.aux_data(rowsparse::kIdx).dptr<IType>();
-      DType* grad_val = grad.data().dptr<DType>();
-      index_t num_rows = grad.aux_shape(rowsparse::kIdx)[0];
-      auto row_length = weight.shape_.ProdShape(1, weight.ndim());
-      Kernel<SGDDnsRspKernel<req_type>, xpu>::Launch(s, num_rows, row_length,
+      const IType* grad_idx = grad.aux_data(rowsparse::kIdx).dptr<IType>();
+      const DType* grad_val = grad.data().dptr<DType>();
+      const nnvm::dim_t num_rows = grad.aux_shape(rowsparse::kIdx)[0];
+      const auto row_length = weight.shape_.ProdShape(1, weight.ndim());
+      size_t num_threads = num_rows;
+      if (std::is_same<xpu, gpu>::value) {
+        num_threads = num_rows * row_length;
+      }
+      Kernel<SGDDnsRspKernel<req_type, xpu>, xpu>::Launch(s, num_threads, row_length,
         out->dptr<DType>(), weight_data, grad_idx, grad_val,
         static_cast<DType>(param.clip_gradient),
         static_cast<DType>(param.lr), static_cast<DType>(param.wd),
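The GPU specialization above parallelizes over gradient elements rather than rows: each thread recovers its (row, column) pair from the flat index i and scatters the update into the dense weight through grad_idx, avoiding the per-thread loop over row_length that the CPU kernel uses. The standalone C++ sketch below mirrors that index arithmetic on the host; it is illustrative toy code, not part of the patch, and all sizes and values are made up:

    // Toy host-side walk-through of the element-wise indexing used by
    // SGDDnsRspKernel<req, gpu>: iteration i plays the role of one GPU thread.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const int64_t row_length = 3;                    // elements per weight row
      std::vector<float> weight(4 * row_length, 1.f);  // dense weight, 4 rows
      std::vector<int64_t> grad_idx = {1, 3};          // rows present in the gradient
      std::vector<float> grad_val = {.1f, .2f, .3f, .4f, .5f, .6f};  // 2 x 3 values
      const float lr = 0.5f, wd = 0.f, rescale_grad = 1.f;
      for (int64_t i = 0; i < static_cast<int64_t>(grad_val.size()); ++i) {
        const int64_t row_id = i / row_length;   // which occupied row
        const int64_t col_id = i % row_length;   // offset inside that row
        const int64_t data_i = grad_idx[row_id] * row_length + col_id;
        weight[data_i] = (1.f - lr * wd) * weight[data_i]
                         - lr * rescale_grad * grad_val[i];
      }
      printf("weight[3] = %.2f\n", weight[3]);   // dense row 1, col 0: 1 - 0.5*0.1 = 0.95
      return 0;
    }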
@@ -375,8 +410,11 @@ inline void MP_SGDMomUpdate(const nnvm::NodeAttrs& attrs,
   });
 }
 
+template<int req, typename xpu>
+struct SGDMomDnsRspDnsKernel;
+
 template<int req>
-struct SGDMomDnsRspDnsKernel {
+struct SGDMomDnsRspDnsKernel<req, cpu> {
   template<typename DType, typename IType>
   MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
     DType* mom_data, const DType* weight_data, const IType* grad_idx,
@@ -402,6 +440,33 @@ struct SGDMomDnsRspDnsKernel {
   }
 };
 
+template<int req>
+struct SGDMomDnsRspDnsKernel<req, gpu> {
+  template<typename DType, typename IType>
+  MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
+    DType* mom_data, const DType* weight_data, const IType* grad_idx,
+    const DType* grad_data, const DType clip_gradient, const DType momentum,
+    const DType lr, const DType wd, const DType rescale_grad) {
+    using nnvm::dim_t;
+    const DType rate = lr * wd;
+    const dim_t row_id = i / row_length;
+    const dim_t col_id = i % row_length;
+    const dim_t data_i = grad_idx[row_id] * row_length + col_id;
+    if (clip_gradient >= 0.0f) {
+      mom_data[data_i] = momentum * mom_data[data_i]
+              - rate * weight_data[data_i]
+              - lr *
+              mshadow_op::clip::Map(rescale_grad * grad_data[i],
+                                    clip_gradient);
+    } else {
+      mom_data[data_i] = momentum * mom_data[data_i]
+                - rate * weight_data[data_i]
+                - lr * rescale_grad * grad_data[i];
+    }
+    KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]);
+  }
+};
+
 template<typename xpu>
 inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
                                       const OpContext& ctx,
@@ -428,7 +493,11 @@ inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
       DType* out_data = out->dptr<DType>();
       index_t num_rows = grad.aux_shape(kIdx)[0];
       auto row_length = weight.shape_.ProdShape(1, weight.ndim());
-      Kernel<SGDMomDnsRspDnsKernel<req_type>, xpu>::Launch(s, num_rows, row_length,
+      size_t num_threads = num_rows;
+      if (std::is_same<xpu, gpu>::value) {
+        num_threads = num_rows * row_length;
+      }
+      Kernel<SGDMomDnsRspDnsKernel<req_type, xpu>, xpu>::Launch(s, num_threads, row_length,
         out_data, mom_data, weight_data, grad_idx, grad_val,
         static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
         static_cast<DType>(param.lr), static_cast<DType>(param.wd),
@@ -493,46 +562,24 @@ inline bool StdOptStorageType(const nnvm::NodeAttrs& attrs,
     // rsp, ..., rsp/dns, ... -> rsp
     dispatched = storage_type_assign(out_attrs, kRowSparseStorage,
                                      dispatch_mode, DispatchMode::kFComputeEx);
+    // warn users if lazy_update is turned on
+    if (dispatched) {
+      common::LogOnce(attrs.name + " with lazy_update = True detected. "
+                      "Be aware that lazy update is different from standard update, "
+                      "and may lead to different empirical results. See "
+                      "https://mxnet.incubator.apache.org/api/python/optimization/optimization.html "
+                      "for more details.");
+    }
   }
-
   if (!dispatched) {
     dispatched = dispatch_fallback(out_attrs, dispatch_mode);
   }
   return dispatched;
 }
 
-template<int req>
-struct SGDMomStdDnsRspDnsKernel {
-  template<typename DType, typename IType, typename RType>
-  MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
-    DType* mom_data, const DType* weight_data, const IType* grad_idx,
-    const DType* grad_data, const RType* prefix_sum, const DType clip_gradient,
-    const DType momentum, const DType lr, const DType wd, const DType rescale_grad) {
-    const DType rate = lr * wd;
-    const bool non_zero = (i == 0) ? prefix_sum[0] > 0
-                                   : prefix_sum[i] > prefix_sum[i-1];
+template<int req, typename xpu>
+struct SGDMomStdDnsRspDnsKernel;
 
-    const index_t row_i = i * row_length;
-    const RType grad_i = (prefix_sum[i]-1) * row_length;
-    for (index_t j = 0; j < row_length; j++) {
-      const index_t data_i = row_i + j;
-      const DType grad = non_zero ? grad_data[grad_i + j]
-                                  : static_cast<DType>(0);
-      if (clip_gradient >= 0.0f) {
-        mom_data[data_i] = momentum * mom_data[data_i]
-                - rate * weight_data[data_i]
-                - lr *
-                mshadow_op::clip::Map(rescale_grad * grad,
-                                      clip_gradient);
-      } else {
-        mom_data[data_i] = momentum * mom_data[data_i]
-                  - rate * weight_data[data_i]
-                  - lr * rescale_grad * grad;
-      }
-      KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]);
-    }
-  }
-};
 
 template<typename xpu>
 void SGDMomStdUpdateDnsRspDnsImpl(const SGDMomParam& param,
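Both launch sites above pick the thread count with a compile-time check on xpu: one thread per occupied row on CPU (the CPU Map() loops across the row), one thread per element on GPU. Below is a minimal sketch of that dispatch with hypothetical stand-in cpu/gpu tag structs and a made-up NumThreads helper, not mshadow's real types:

    #include <cstddef>
    #include <cstdio>
    #include <type_traits>

    struct cpu {};  // stand-in device tags, not mshadow's
    struct gpu {};

    // Mirrors the num_threads selection in SGDUpdateDnsRspImpl /
    // SGDMomUpdateDnsRspDnsImpl: rows on cpu, elements on gpu.
    template <typename xpu>
    size_t NumThreads(size_t num_rows, size_t row_length) {
      size_t num_threads = num_rows;
      if (std::is_same<xpu, gpu>::value) {
        num_threads = num_rows * row_length;
      }
      return num_threads;
    }

    int main() {
      printf("cpu: %zu threads\n", NumThreads<cpu>(4, 128));  // 4
      printf("gpu: %zu threads\n", NumThreads<gpu>(4, 128));  // 512
      return 0;
    }

Because the check is on a template parameter, the branch resolves at compile time and the dead path costs nothing at runtime.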
diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc
index f7ccbbb739d6..7d87e2c94134 100644
--- a/src/operator/optimizer_op.cc
+++ b/src/operator/optimizer_op.cc
@@ -98,6 +98,38 @@ Where the parameter ``momentum`` is the decay rate of momentum estimates at each
 .add_argument("mom", "NDArray-or-Symbol", "Momentum")
 .add_arguments(SignumParam::__FIELDS__());
 
+template<int req>
+struct SGDMomStdDnsRspDnsKernel<req, cpu> {
+  template<typename DType, typename IType, typename RType>
+  MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
+    DType* mom_data, const DType* weight_data, const IType* grad_idx,
+    const DType* grad_data, const RType* prefix_sum, const DType clip_gradient,
+    const DType momentum, const DType lr, const DType wd, const DType rescale_grad) {
+    const DType rate = lr * wd;
+    const bool non_zero = (i == 0) ? prefix_sum[0] > 0
+                                   : prefix_sum[i] > prefix_sum[i-1];
+
+    const index_t row_i = i * row_length;
+    const RType grad_i = (prefix_sum[i]-1) * row_length;
+    for (index_t j = 0; j < row_length; j++) {
+      const index_t data_i = row_i + j;
+      const DType grad = non_zero ? grad_data[grad_i + j]
+                                  : static_cast<DType>(0);
+      if (clip_gradient >= 0.0f) {
+        mom_data[data_i] = momentum * mom_data[data_i]
+                - rate * weight_data[data_i]
+                - lr *
+                mshadow_op::clip::Map(rescale_grad * grad,
+                                      clip_gradient);
+      } else {
+        mom_data[data_i] = momentum * mom_data[data_i]
+                  - rate * weight_data[data_i]
+                  - lr * rescale_grad * grad;
+      }
+      KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]);
+    }
+  }
+};
 
 template<>
 void SGDMomStdUpdateDnsRspDnsImpl<cpu>(const SGDMomParam& param,
@@ -139,7 +171,7 @@ void SGDMomStdUpdateDnsRspDnsImpl<cpu>(const SGDMomParam& param,
         prefix_sum[i] += prefix_sum[i - 1];
       }
     }
-    Kernel<SGDMomStdDnsRspDnsKernel<req_type>, cpu>::Launch(s, num_rows, row_length,
+    Kernel<SGDMomStdDnsRspDnsKernel<req_type, cpu>, cpu>::Launch(s, num_rows, row_length,
       out_data, mom_data, weight_data, grad_idx, grad_val, prefix_sum,
       static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
      static_cast<DType>(param.lr), static_cast<DType>(param.wd),
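The standard-update kernels rely on prefix_sum, an inclusive prefix sum over per-row occupancy flags: dense row i received a gradient iff prefix_sum[i] increments at i, and that gradient lives at compacted row prefix_sum[i] - 1. A self-contained C++ sketch of the bookkeeping, with toy data rather than MXNet code:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const int64_t num_rows = 4;
      std::vector<int64_t> grad_idx = {1, 3};        // occupied rows of the rsp gradient
      std::vector<int64_t> prefix_sum(num_rows, 0);
      for (int64_t r : grad_idx) prefix_sum[r] = 1;  // mark occupied rows
      for (int64_t i = 1; i < num_rows; ++i) prefix_sum[i] += prefix_sum[i - 1];
      // prefix_sum is now {0, 1, 1, 2}
      for (int64_t row = 0; row < num_rows; ++row) {
        const bool non_zero = (row == 0) ? prefix_sum[0] > 0
                                         : prefix_sum[row] > prefix_sum[row - 1];
        if (non_zero) {
          printf("row %lld: gradient at compacted row %lld\n",
                 (long long)row, (long long)(prefix_sum[row] - 1));
        } else {
          printf("row %lld: no gradient, update uses grad = 0\n", (long long)row);
        }
      }
      return 0;
    }

This is what lets the standard update sweep all num_rows dense rows, so weight decay and momentum decay reach rows with no gradient, while still locating each present gradient row in O(1).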
diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu
index 18ee66a729c2..0fd2ca83fda4 100644
--- a/src/operator/optimizer_op.cu
+++ b/src/operator/optimizer_op.cu
@@ -29,6 +29,35 @@
 namespace mxnet {
 namespace op {
 
+template<int req>
+struct SGDMomStdDnsRspDnsKernel<req, gpu> {
+  template<typename DType, typename IType, typename RType>
+  MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
+    DType* mom_data, const DType* weight_data, const IType* grad_idx,
+    const DType* grad_data, const RType* prefix_sum, const DType clip_gradient,
+    const DType momentum, const DType lr, const DType wd, const DType rescale_grad) {
+    using nnvm::dim_t;
+    const DType rate = lr * wd;
+    const dim_t row_id = i / row_length;
+    const dim_t col_id = i % row_length;
+    const dim_t nnr = prefix_sum[row_id];
+    const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0
+                                        : nnr > prefix_sum[row_id - 1];
+    const RType grad_i = (nnr - 1) * row_length + col_id;
+    const DType grad = non_zero ? grad_data[grad_i]
+                                : static_cast<DType>(0);
+    if (clip_gradient >= 0.0f) {
+      mom_data[i] = momentum * mom_data[i]
+              - rate * weight_data[i]
+              - lr * mshadow_op::clip::Map(rescale_grad * grad, clip_gradient);
+    } else {
+      mom_data[i] = momentum * mom_data[i]
+                - rate * weight_data[i] - lr * rescale_grad * grad;
+    }
+    KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]);
+  }
+};
+
 template<>
 void SGDMomStdUpdateDnsRspDnsImpl<gpu>(const SGDMomParam& param,
                                        const OpContext& ctx,
@@ -84,7 +113,8 @@ void SGDMomStdUpdateDnsRspDnsImpl<gpu>(const SGDMomParam& param,
                                   num_rows,
                                   mshadow::Stream<gpu>::GetStream(s));
   }
-  Kernel<SGDMomStdDnsRspDnsKernel<req_type>, gpu>::Launch(s, num_rows, row_length,
+  size_t num_threads = num_rows * row_length;
+  Kernel<SGDMomStdDnsRspDnsKernel<req_type, gpu>, gpu>::Launch(s, num_threads, row_length,
     out_data, mom_data, weight_data, grad_idx, grad_val, prefix_sum,
     static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
     static_cast<DType>(param.lr), static_cast<DType>(param.wd),

From 1a324734d571671d8ecd3a696d826f960dd75cd8 Mon Sep 17 00:00:00 2001
From: ZiyueHuang
Date: Wed, 28 Mar 2018 05:46:01 +0000
Subject: [PATCH 2/2] update warning msg

---
 src/operator/optimizer_op-inl.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h
index d56b9806b735..66e5059d6ea5 100644
--- a/src/operator/optimizer_op-inl.h
+++ b/src/operator/optimizer_op-inl.h
@@ -545,26 +545,26 @@ inline bool StdOptStorageType(const nnvm::NodeAttrs& attrs,
                               DispatchMode* dispatch_mode,
                               std::vector<int>* in_attrs,
                               std::vector<int>* out_attrs) {
+  using namespace common;
   CHECK_EQ(in_attrs->size(), static_cast<size_t>(n_rsp + n_rsp_dns));
   CHECK_EQ(out_attrs->size(), 1U);
   bool dispatched = false;
-
-  if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
+  if (!dispatched && ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
     // dns, ... -> dns
     dispatched = storage_type_assign(out_attrs, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
   }
   const std::vector<int> rsp_stypes(in_attrs->begin(), in_attrs->begin() + n_rsp);
   const std::vector<int> rsp_dns_stypes(in_attrs->begin() + n_rsp, in_attrs->end());
-  if (!dispatched && common::ContainsOnlyStorage(rsp_stypes, kRowSparseStorage) &&
-      (common::ContainsOnlyStorage(rsp_dns_stypes, kRowSparseStorage) ||
-       common::ContainsOnlyStorage(rsp_dns_stypes, kDefaultStorage))) {
+  if (!dispatched && ContainsOnlyStorage(rsp_stypes, kRowSparseStorage) &&
+      (ContainsOnlyStorage(rsp_dns_stypes, kRowSparseStorage) ||
+       ContainsOnlyStorage(rsp_dns_stypes, kDefaultStorage))) {
     // rsp, ..., rsp/dns, ... -> rsp
     dispatched = storage_type_assign(out_attrs, kRowSparseStorage,
                                      dispatch_mode, DispatchMode::kFComputeEx);
     // warn users if lazy_update is turned on
-    if (dispatched) {
-      common::LogOnce(attrs.name + " with lazy_update = True detected. "
+    if (dispatched && ContainsOnlyStorage(rsp_dns_stypes, kRowSparseStorage)) {
+      LogOnce("Optimizer with lazy_update = True detected. "
                       "Be aware that lazy update is different from standard update, "
                       "and may lead to different empirical results. See "
                       "https://mxnet.incubator.apache.org/api/python/optimization/optimization.html "
                       "for more details.");
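For context on the warning this second commit refines: a lazy update skips rows that are absent from the sparse gradient altogether, so those rows also miss their weight-decay (and momentum-decay) step, whereas the standard update touches every row. A minimal sketch of the divergence for plain SGD with weight decay (illustrative only; wd is folded into the update the same way the kernels above fold it):

    #include <cstdio>
    #include <vector>

    int main() {
      const float lr = 0.1f, wd = 0.5f;
      std::vector<float> lazy = {1.f, 1.f}, standard = {1.f, 1.f};
      const size_t grad_row = 0;          // only row 0 has a gradient
      const float grad = 1.f;
      // Lazy: row 1 is never visited, so its weight decay is skipped too.
      lazy[grad_row] = (1.f - lr * wd) * lazy[grad_row] - lr * grad;
      // Standard: every row decays, gradient or not.
      for (size_t r = 0; r < standard.size(); ++r) {
        const float g = (r == grad_row) ? grad : 0.f;
        standard[r] = (1.f - lr * wd) * standard[r] - lr * g;
      }
      printf("row 1: lazy = %.2f, standard = %.2f\n", lazy[1], standard[1]);  // 1.00 vs 0.95
      return 0;
    }

The two schemes agree on rows that appear in every gradient and drift apart on rows that do not, which is why the message warns that empirical results may differ.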