Merged
137 changes: 92 additions & 45 deletions src/operator/optimizer_op-inl.h
@@ -108,8 +108,39 @@ inline void SGDUpdate(const nnvm::NodeAttrs& attrs,

/*! \brief kernel for sparse sgd
*/
template<int req, typename xpu>
struct SGDDnsRspKernel;

template<int req>
- struct SGDDnsRspKernel {
+ struct SGDDnsRspKernel<req, gpu> {
// DType is the output data type
// IType is row sparse idx type
// i is the ith element in row sparse gradient
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, const index_t row_length, DType* out, const DType* weight,
const IType* grad_idx, const DType *grad_val,
const DType clip_gradient, const DType lr,
const DType wd, const DType rescale_grad) {
using nnvm::dim_t;
using namespace mshadow_op;
const dim_t row_id = i / row_length;
const dim_t col_id = i % row_length;
const dim_t row_offset = grad_idx[row_id] * row_length;
const dim_t data_i = row_offset + col_id;
if (clip_gradient >= 0.0f) {
KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] -
(lr) * mshadow_op::clip::Map(rescale_grad * grad_val[i], clip_gradient));
} else {
KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] -
(lr * rescale_grad) * grad_val[i]);
}
}
};

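For readers of the diff: the new GPU specialization launches one thread per element of the sparse gradient, recovering the row and column by division and modulo, and finding the destination row in the dense weight through grad_idx. Below is a minimal host-side sketch of the same update, assuming illustrative names and no MXNet dependencies (clip::Map is modeled with std::min/std::max):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Reference for the lazy sparse SGD step: only rows listed in grad_idx are
// touched; each flat index i maps to one (row, column) of the gradient.
void SparseSgdReference(std::vector<float>* weight,            // num_rows x row_length, dense
                        const std::vector<int64_t>& grad_idx,  // rows with a nonzero gradient
                        const std::vector<float>& grad_val,    // grad_idx.size() x row_length
                        int64_t row_length, float lr, float wd,
                        float rescale_grad, float clip_gradient) {
  const int64_t num_elems = static_cast<int64_t>(grad_idx.size()) * row_length;
  for (int64_t i = 0; i < num_elems; ++i) {  // on the GPU, one thread per i
    const int64_t row_id = i / row_length;   // row within the sparse gradient
    const int64_t col_id = i % row_length;   // column within that row
    const int64_t data_i = grad_idx[row_id] * row_length + col_id;  // dense offset
    float g = rescale_grad * grad_val[i];
    if (clip_gradient >= 0.f)                // a negative clip_gradient disables clipping
      g = std::max(-clip_gradient, std::min(g, clip_gradient));
    (*weight)[data_i] = (1.f - lr * wd) * (*weight)[data_i] - lr * g;
  }
}
```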
/*! \brief kernel for sparse sgd
*/
template<int req>
struct SGDDnsRspKernel<req, cpu> {
// DType is the output data type
// IType is row sparse idx type
// i is the ith row in row sparse gradient
@@ -154,11 +185,15 @@ inline void SGDUpdateDnsRspImpl(const SGDParam& param,
MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, {
MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
DType* weight_data = weight.dptr<DType>();
- IType* grad_idx = grad.aux_data(rowsparse::kIdx).dptr<IType>();
- DType* grad_val = grad.data().dptr<DType>();
- index_t num_rows = grad.aux_shape(rowsparse::kIdx)[0];
- auto row_length = weight.shape_.ProdShape(1, weight.ndim());
- Kernel<SGDDnsRspKernel<req_type>, xpu>::Launch(s, num_rows, row_length,
+ const IType* grad_idx = grad.aux_data(rowsparse::kIdx).dptr<IType>();
+ const DType* grad_val = grad.data().dptr<DType>();
+ const nnvm::dim_t num_rows = grad.aux_shape(rowsparse::kIdx)[0];
+ const auto row_length = weight.shape_.ProdShape(1, weight.ndim());
+ size_t num_threads = num_rows;
+ if (std::is_same<xpu, gpu>::value) {
+   num_threads = num_rows * row_length;
+ }
+ Kernel<SGDDnsRspKernel<req_type, xpu>, xpu>::Launch(s, num_threads, row_length,
out->dptr<DType>(), weight_data, grad_idx, grad_val,
static_cast<DType>(param.clip_gradient),
static_cast<DType>(param.lr), static_cast<DType>(param.wd),
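A note on the thread-count arithmetic above: the CPU kernel processes a whole gradient row per work item, while the GPU kernel wants one thread per element. A standalone restatement of that rule, assuming mshadow-style cpu/gpu tag types (illustrative, not MXNet's API):

```cpp
#include <cstddef>
#include <type_traits>

struct cpu {};  // stand-ins for mshadow's device tag types
struct gpu {};

// CPU: one unit of work per gradient row (the kernel loops over columns).
// GPU: one thread per element for finer-grained parallelism.
template <typename xpu>
std::size_t NumThreads(std::size_t num_rows, std::size_t row_length) {
  std::size_t num_threads = num_rows;
  if (std::is_same<xpu, gpu>::value) {
    num_threads = num_rows * row_length;
  }
  return num_threads;
}

// NumThreads<cpu>(100, 64) == 100; NumThreads<gpu>(100, 64) == 6400.
```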
@@ -375,8 +410,11 @@ inline void MP_SGDMomUpdate(const nnvm::NodeAttrs& attrs,
});
}

template<int req, typename xpu>
struct SGDMomDnsRspDnsKernel;

template<int req>
- struct SGDMomDnsRspDnsKernel {
+ struct SGDMomDnsRspDnsKernel<req, cpu> {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
DType* mom_data, const DType* weight_data, const IType* grad_idx,
@@ -402,6 +440,33 @@ struct SGDMomDnsRspDnsKernel {
}
};

template<int req>
struct SGDMomDnsRspDnsKernel<req, gpu> {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
DType* mom_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const DType clip_gradient, const DType momentum,
const DType lr, const DType wd, const DType rescale_grad) {
using nnvm::dim_t;
const DType rate = lr * wd;
const dim_t row_id = i / row_length;
const dim_t col_id = i % row_length;
const dim_t data_i = grad_idx[row_id] * row_length + col_id;
if (clip_gradient >= 0.0f) {
mom_data[data_i] = momentum * mom_data[data_i]
- rate * weight_data[data_i]
- lr *
mshadow_op::clip::Map(rescale_grad * grad_data[i],
clip_gradient);
} else {
mom_data[data_i] = momentum * mom_data[data_i]
- rate * weight_data[data_i]
- lr * rescale_grad * grad_data[i];
}
KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]);
}
};

Contributor (on the line break in mshadow_op::clip::Map above): Why a line break here?

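For reference, both SGDMomDnsRspDnsKernel specializations apply the same per-element momentum step; only the parallelization differs. A single-element sketch in plain C++ (illustrative names, clipping modeled with std::min/std::max):

```cpp
#include <algorithm>

// mom    <- momentum * mom - lr * wd * weight - lr * clip(rescale_grad * grad)
// weight <- weight + mom
void SgdMomStep(float* weight, float* mom, float grad, float momentum,
                float lr, float wd, float rescale_grad, float clip_gradient) {
  float g = rescale_grad * grad;
  if (clip_gradient >= 0.f)  // a negative clip_gradient disables clipping
    g = std::max(-clip_gradient, std::min(g, clip_gradient));
  *mom = momentum * (*mom) - lr * wd * (*weight) - lr * g;
  *weight += *mom;
}
```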
template<typename xpu>
inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
const OpContext& ctx,
@@ -428,7 +493,11 @@ inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
DType* out_data = out->dptr<DType>();
index_t num_rows = grad.aux_shape(kIdx)[0];
auto row_length = weight.shape_.ProdShape(1, weight.ndim());
- Kernel<SGDMomDnsRspDnsKernel<req_type>, xpu>::Launch(s, num_rows, row_length,
+ size_t num_threads = num_rows;
+ if (std::is_same<xpu, gpu>::value) {
+   num_threads = num_rows * row_length;
+ }
+ Kernel<SGDMomDnsRspDnsKernel<req_type, xpu>, xpu>::Launch(s, num_threads, row_length,
out_data, mom_data, weight_data, grad_idx, grad_val,
static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
static_cast<DType>(param.lr), static_cast<DType>(param.wd),
@@ -476,63 +545,41 @@ inline bool StdOptStorageType(const nnvm::NodeAttrs& attrs,
DispatchMode* dispatch_mode,
std::vector<int>* in_attrs,
std::vector<int>* out_attrs) {
using namespace common;
CHECK_EQ(in_attrs->size(), static_cast<size_t>(n_rsp + n_rsp_dns));
CHECK_EQ(out_attrs->size(), 1U);
bool dispatched = false;

- if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
+ if (!dispatched && ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
// dns, ... -> dns
dispatched = storage_type_assign(out_attrs, kDefaultStorage,
dispatch_mode, DispatchMode::kFCompute);
}
const std::vector<int> rsp_stypes(in_attrs->begin(), in_attrs->begin() + n_rsp);
const std::vector<int> rsp_dns_stypes(in_attrs->begin() + n_rsp, in_attrs->end());
- if (!dispatched && common::ContainsOnlyStorage(rsp_stypes, kRowSparseStorage) &&
-     (common::ContainsOnlyStorage(rsp_dns_stypes, kRowSparseStorage) ||
-      common::ContainsOnlyStorage(rsp_dns_stypes, kDefaultStorage))) {
+ if (!dispatched && ContainsOnlyStorage(rsp_stypes, kRowSparseStorage) &&
+     (ContainsOnlyStorage(rsp_dns_stypes, kRowSparseStorage) ||
+      ContainsOnlyStorage(rsp_dns_stypes, kDefaultStorage))) {
// rsp, ..., rsp/dns, ... -> rsp
dispatched = storage_type_assign(out_attrs, kRowSparseStorage,
dispatch_mode, DispatchMode::kFComputeEx);
// warn users if lazy_update is turned on
if (dispatched && ContainsOnlyStorage(rsp_dns_stypes, kRowSparseStorage)) {
LogOnce("Optimizer with lazy_update = True detected. "
"Be aware that lazy update is different from standard update, "
"and may lead to different empirical results. See "
"https://mxnet.incubator.apache.org/api/python/optimization/optimization.html "
"for more details.");
}
}

if (!dispatched) {
dispatched = dispatch_fallback(out_attrs, dispatch_mode);
}
return dispatched;
}

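The dispatch rule in StdOptStorageType, restated as a self-contained sketch with simplified stand-in types (not MXNet's actual enums): all-dense inputs take the dense FCompute path; row-sparse weight/grad with all-row-sparse or all-dense auxiliary inputs take the sparse FComputeEx path; everything else falls back.

```cpp
#include <algorithm>
#include <vector>

enum Storage  { kDense, kRowSparse };                  // simplified stand-ins
enum Dispatch { kFCompute, kFComputeEx, kFallback };

// The first n_rsp inputs (e.g. weight, grad) must be row-sparse for the
// sparse path; the remaining inputs (e.g. momentum) may be all row-sparse
// (lazy update) or all dense (standard update).
Dispatch ChooseDispatch(const std::vector<Storage>& in, int n_rsp) {
  auto all = [](const std::vector<Storage>& v, Storage s) {
    return std::all_of(v.begin(), v.end(), [s](Storage x) { return x == s; });
  };
  if (all(in, kDense)) return kFCompute;               // dns, ... -> dns
  const std::vector<Storage> rsp(in.begin(), in.begin() + n_rsp);
  const std::vector<Storage> rest(in.begin() + n_rsp, in.end());
  if (all(rsp, kRowSparse) && (all(rest, kRowSparse) || all(rest, kDense)))
    return kFComputeEx;                                // rsp, ..., rsp/dns -> rsp
  return kFallback;
}
```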
- template<int req>
- struct SGDMomStdDnsRspDnsKernel {
-   template<typename DType, typename IType, typename RType>
-   MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
-     DType* mom_data, const DType* weight_data, const IType* grad_idx,
-     const DType* grad_data, const RType* prefix_sum, const DType clip_gradient,
-     const DType momentum, const DType lr, const DType wd, const DType rescale_grad) {
-     const DType rate = lr * wd;
-     const bool non_zero = (i == 0) ? prefix_sum[0] > 0
-                                    : prefix_sum[i] > prefix_sum[i-1];
+ template<int req, typename xpu>
+ struct SGDMomStdDnsRspDnsKernel;

-     const index_t row_i = i * row_length;
-     const RType grad_i = (prefix_sum[i]-1) * row_length;
-     for (index_t j = 0; j < row_length; j++) {
-       const index_t data_i = row_i + j;
-       const DType grad = non_zero ? grad_data[grad_i + j]
-                                   : static_cast<DType>(0);
-       if (clip_gradient >= 0.0f) {
-         mom_data[data_i] = momentum * mom_data[data_i]
-                    - rate * weight_data[data_i]
-                    - lr *
-                      mshadow_op::clip::Map(rescale_grad * grad,
-                                            clip_gradient);
-       } else {
-         mom_data[data_i] = momentum * mom_data[data_i]
-                   - rate * weight_data[data_i]
-                   - lr * rescale_grad * grad;
-       }
-       KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]);
-     }
-   }
- };

template<typename xpu>
void SGDMomStdUpdateDnsRspDnsImpl(const SGDMomParam& param,
34 changes: 33 additions & 1 deletion src/operator/optimizer_op.cc
@@ -98,6 +98,38 @@ Where the parameter ``momentum`` is the decay rate of momentum estimates at each
.add_argument("mom", "NDArray-or-Symbol", "Momentum")
.add_arguments(SignumParam::__FIELDS__());

template<int req>
struct SGDMomStdDnsRspDnsKernel<req, cpu> {
template<typename DType, typename IType, typename RType>
MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
DType* mom_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const RType* prefix_sum, const DType clip_gradient,
const DType momentum, const DType lr, const DType wd, const DType rescale_grad) {
const DType rate = lr * wd;
const bool non_zero = (i == 0) ? prefix_sum[0] > 0
: prefix_sum[i] > prefix_sum[i-1];

const index_t row_i = i * row_length;
const RType grad_i = (prefix_sum[i]-1) * row_length;
for (index_t j = 0; j < row_length; j++) {
const index_t data_i = row_i + j;
const DType grad = non_zero ? grad_data[grad_i + j]
: static_cast<DType>(0);
if (clip_gradient >= 0.0f) {
mom_data[data_i] = momentum * mom_data[data_i]
- rate * weight_data[data_i]
- lr *
mshadow_op::clip::Map(rescale_grad * grad,
clip_gradient);
} else {
mom_data[data_i] = momentum * mom_data[data_i]
- rate * weight_data[data_i]
- lr * rescale_grad * grad;
}
KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]);
}
}
};

Contributor (on the line break in mshadow_op::clip::Map above): Why line break here?

Member Author (eric-haibin-lin, Apr 2, 2018): No particular reason. Removing the line break would make it a 200-character line.

Contributor: I mean the extra line break for lr * mshadow_op::clip::Map; these two places are inconsistent with what you have on line 52 of optimizer_op.cu below.

Member Author (eric-haibin-lin): I don't think it's necessary to add or remove that extra line break. Please provide constructive feedback/review comments.

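How the kernel above uses prefix_sum: it is an inclusive count, over dense rows, of how many gradient rows occur at or before each row. Row i carries a gradient exactly when its count exceeds the previous row's, and prefix_sum[i] - 1 is then that row's position in the compacted gradient storage. A standalone sketch of the lookup (illustrative names):

```cpp
#include <cstdint>
#include <vector>

// Mirrors the kernel's test: does dense row i have a gradient row, and if so,
// which compacted row holds it? grad_row is meaningful only when true is returned.
bool RowHasGrad(const std::vector<int64_t>& prefix_sum, int64_t i,
                int64_t* grad_row) {
  const bool non_zero = (i == 0) ? prefix_sum[0] > 0
                                 : prefix_sum[i] > prefix_sum[i - 1];
  *grad_row = prefix_sum[i] - 1;  // row index into the compacted gradient values
  return non_zero;
}
```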
template<>
void SGDMomStdUpdateDnsRspDnsImpl<cpu>(const SGDMomParam& param,
@@ -139,7 +171,7 @@ void SGDMomStdUpdateDnsRspDnsImpl<cpu>(const SGDMomParam& param,
prefix_sum[i] += prefix_sum[i - 1];
}
}
- Kernel<SGDMomStdDnsRspDnsKernel<req_type>, cpu>::Launch(s, num_rows, row_length,
+ Kernel<SGDMomStdDnsRspDnsKernel<req_type, cpu>, cpu>::Launch(s, num_rows, row_length,
out_data, mom_data, weight_data, grad_idx, grad_val, prefix_sum,
static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
static_cast<DType>(param.lr), static_cast<DType>(param.wd),
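For context, the inclusive scan visible above is the second half of building prefix_sum; the first half marks which dense rows appear in grad_idx. A self-contained sketch of one way to produce the array (illustrative, assuming in-bounds indices):

```cpp
#include <cstdint>
#include <vector>

// Mark rows that occur in grad_idx, then take an inclusive prefix sum, so
// prefix_sum[i] counts gradient rows at or before dense row i.
std::vector<int64_t> BuildPrefixSum(const std::vector<int64_t>& grad_idx,
                                    int64_t num_rows) {
  std::vector<int64_t> prefix_sum(num_rows, 0);
  for (int64_t row : grad_idx) prefix_sum[row] = 1;  // occupancy markers
  for (int64_t i = 1; i < num_rows; ++i) prefix_sum[i] += prefix_sum[i - 1];
  return prefix_sum;
}
```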
32 changes: 31 additions & 1 deletion src/operator/optimizer_op.cu
@@ -29,6 +29,35 @@
namespace mxnet {
namespace op {

template<int req>
struct SGDMomStdDnsRspDnsKernel<req, gpu> {
template<typename DType, typename IType, typename RType>
MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
DType* mom_data, const DType* weight_data, const IType* grad_idx,
const DType* grad_data, const RType* prefix_sum, const DType clip_gradient,
const DType momentum, const DType lr, const DType wd, const DType rescale_grad) {
using nnvm::dim_t;
const DType rate = lr * wd;
const dim_t row_id = i / row_length;
const dim_t col_id = i % row_length;
const dim_t nnr = prefix_sum[row_id];
const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0
: nnr > prefix_sum[row_id - 1];
const RType grad_i = (nnr - 1) * row_length + col_id;
const DType grad = non_zero ? grad_data[grad_i]
: static_cast<DType>(0);
if (clip_gradient >= 0.0f) {
mom_data[i] = momentum * mom_data[i]
- rate * weight_data[i]
- lr * mshadow_op::clip::Map(rescale_grad * grad, clip_gradient);
} else {
mom_data[i] = momentum * mom_data[i]
- rate * weight_data[i] - lr * rescale_grad * grad;
}
KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]);
}
};

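Note the division of work: the CPU specialization handles one dense row per work item with an inner loop over columns, while this GPU specialization flattens the same traversal to one thread per element of the dense weight, so rows without a gradient still get their momentum decayed (the grad = 0 path), as standard update semantics require. A small sketch verifying that the flattened index recovers the same (row, column) pairs:

```cpp
#include <cassert>
#include <cstdint>

// Iterating i over [0, num_rows * row_length) and recovering i / row_length
// and i % row_length visits exactly the pairs the nested CPU loop visits.
void CheckFlattening(int64_t num_rows, int64_t row_length) {
  int64_t i = 0;
  for (int64_t row = 0; row < num_rows; ++row) {
    for (int64_t col = 0; col < row_length; ++col) {
      assert(i / row_length == row);
      assert(i % row_length == col);
      ++i;
    }
  }
}
```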
template<>
void SGDMomStdUpdateDnsRspDnsImpl<gpu>(const SGDMomParam& param,
const OpContext& ctx,
@@ -84,7 +113,8 @@ void SGDMomStdUpdateDnsRspDnsImpl<gpu>(const SGDMomParam& param,
num_rows,
mshadow::Stream<gpu>::GetStream(s));
}
- Kernel<SGDMomStdDnsRspDnsKernel<req_type>, gpu>::Launch(s, num_rows, row_length,
+ size_t num_threads = num_rows * row_length;
+ Kernel<SGDMomStdDnsRspDnsKernel<req_type, gpu>, gpu>::Launch(s, num_threads, row_length,
out_data, mom_data, weight_data, grad_idx, grad_val, prefix_sum,
static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
static_cast<DType>(param.lr), static_cast<DType>(param.wd),