From dd989d68c0830d7beaac22bd189003e7cd8cba50 Mon Sep 17 00:00:00 2001
From: XiaotaoChen
Date: Sat, 2 Jun 2018 11:58:55 +0800
Subject: [PATCH 1/3] implement dot(dns, csr/csr.T)=dns on cpu

---
 src/operator/tensor/dot-inl.h | 141 +++++++++++++++++++++++++++++++---
 1 file changed, 132 insertions(+), 9 deletions(-)

diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h
index ffdb706e5e3c..55f7bc9bb1e2 100644
--- a/src/operator/tensor/dot-inl.h
+++ b/src/operator/tensor/dot-inl.h
@@ -264,11 +264,15 @@ inline bool DotForwardInferStorageType(const nnvm::NodeAttrs& attrs,
   if (!dispatched && lhs_stype == kDefaultStorage && rhs_stype == kCSRStorage &&
       !param.transpose_a) {
     target_stype = hint_has_value ? target_stype : kCSRStorage;
-    // dns, csr -> csr on CPU
-    if (dev_mask == mshadow::cpu::kDevMask && !param.transpose_b) {
-      if (target_stype == kCSRStorage) {
+    if (dev_mask == mshadow::cpu::kDevMask) {
+      // dns, csr -> csr on CPU
+      if (target_stype == kCSRStorage && !param.transpose_b) {
         dispatched = storage_type_assign(&out_stype, kCSRStorage, dispatch_mode,
                                          DispatchMode::kFComputeEx);
+      // dns, csr/csr.T -> dns on CPU
+      } else if (target_stype == kDefaultStorage) {
+        dispatched = storage_type_assign(&out_stype, kDefaultStorage, dispatch_mode,
+                                         DispatchMode::kFComputeEx);
       }
     // dns, csr/csr.T -> dns on GPU
     } else if (dev_mask == mshadow::gpu::kDevMask) {
@@ -327,7 +331,7 @@ inline bool DotBackwardInferStorageType(const nnvm::NodeAttrs& attrs,
       dispatched = true;
     }
   }
-  if (!dispatched && dev_mask == mshadow::gpu::kDevMask && !param.transpose_a &&
+  if (!dispatched && !param.transpose_a &&
       lhs_stype == kDefaultStorage && rhs_stype == kCSRStorage &&
       ograd_stype == kDefaultStorage) {
     if (type_assign(&lhs_grad_stype, kDefaultStorage) &&
@@ -655,7 +659,81 @@ struct DotDnsCsrCsrByRowBlocks {
   }
 };
 
+/*!
+ * \brief CPU Kernel of dot(dns1, csr) = dns2
+ * Parallelization by row blocks
+ */
+struct DotDnsCsrDnsByRowBlocks {
+  /*!
+   * \brief
+   * \param i the i-th thread
+   */
+  template<typename DType, typename IType, typename CType>
+  MSHADOW_CINLINE static void Map(int i,
+                                  DType* out,
+                                  const DType* data_l,
+                                  const DType* data_r,
+                                  const IType* indptr_r,
+                                  const CType* col_idx_r,
+                                  const nnvm::dim_t seg_len,
+                                  const nnvm::dim_t num_rows_l,
+                                  const nnvm::dim_t num_cols_l,
+                                  const nnvm::dim_t num_rows_r,
+                                  const nnvm::dim_t num_cols_r) {
+    using nnvm::dim_t;
+    const dim_t seg_start = i * seg_len;
+    if (seg_start >= num_rows_l) return;
+    const dim_t seg_end = std::min(seg_start + seg_len, num_rows_l);
+    for (dim_t j = 0; j < num_rows_r; ++j) {
+      if (indptr_r[j] == indptr_r[j+1]) continue;
+      for (IType k = indptr_r[j]; k < indptr_r[j+1]; ++k) {
+        const CType col_idx = col_idx_r[k];
+        const DType val = data_r[k];
+        for (dim_t r = seg_start; r < seg_end; ++r) {
+          out[r*num_cols_r+col_idx] += data_l[r*num_cols_l+j] * val;
+        }
+      }
+    }
+  }
+};
+
+/*!
+ * \brief CPU Kernel of dot(dns1, csr.T) = dns2
+ * Parallelization by row blocks
+ */
+struct DotDnsCsrTransDnsByRowBlocks {
+  /*!
+   * \brief
+   * \param i the i-th thread
+   */
+  template<typename DType, typename IType, typename CType>
+  MSHADOW_CINLINE static void Map(int i,
+                                  DType* out,
+                                  const DType* data_l,
+                                  const DType* data_r,
+                                  const IType* indptr_r,
+                                  const CType* col_idx_r,
+                                  const nnvm::dim_t seg_len,
+                                  const nnvm::dim_t num_rows_l,
+                                  const nnvm::dim_t num_cols_l,
+                                  const nnvm::dim_t num_rows_r,
+                                  const nnvm::dim_t num_cols_r) {
+    using nnvm::dim_t;
+    const dim_t seg_start = i * seg_len;
+    if (seg_start >= num_rows_l) return;
+    const dim_t seg_end = std::min(seg_start + seg_len, num_rows_l);
+    for (dim_t j = 0; j < num_rows_r; ++j) {
+      if (indptr_r[j] == indptr_r[j+1]) continue;
+      for (IType k = indptr_r[j]; k < indptr_r[j+1]; ++k) {
+        const CType col_idx = col_idx_r[k];
+        const DType val = data_r[k];
+        for (dim_t r = seg_start; r < seg_end; ++r) {
+          out[r*num_rows_r+j] += data_l[r*num_cols_l+col_idx] * val;
+        }
+      }
+    }
+  }
+};
+
 /*!
  * \brief CPU Impl of dot(csr, dns1) = dns2 and dot(csr.T, dns1) = dns2
@@ -1031,13 +1109,58 @@ inline void DotDnsCsrCsrImpl(const OpContext& ctx, const cpu& cpu_dev,
 }
 
 /*
- * \brief Impl of dot(dns, csr) = dense (GPU only)
+ * \brief Impl of dot(dns, csr) = dns and dot(dns, csr.T) = dns
  */
 inline void DotDnsCsrDnsImpl(const OpContext& ctx, const cpu& cpu_dev,
-                             const TBlob& dns, const NDArray& rhs,
-                             const OpReqType req, NDArray* ret,
-                             const bool transpose_b) {
-  LOG(FATAL) << "dot(dense, csr) = dense is not implemented on CPU";
+                             const TBlob& dns, const NDArray& rhs,
+                             const OpReqType req, NDArray* ret,
+                             const bool transpose_b) {
+  if (req == kNullOp) return;
+  CHECK_EQ(rhs.storage_type(), kCSRStorage);
+  mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
+  if (!rhs.storage_initialized()) {
+    FillZerosCsrImpl(s, *ret);
+    return;
+  }
+
+  using nnvm::dim_t;
+
+  const TBlob data_r = rhs.data();
+  const TBlob indptr_r = rhs.aux_data(csr::kIndPtr);
+  const TBlob col_idx_r = rhs.aux_data(csr::kIdx);
+  const TBlob& data_l = dns;
+  const TBlob data_out = ret->data();
+
+  MSHADOW_SGL_DBL_TYPE_SWITCH(data_r.type_flag_, DType, {  // data type
+    MSHADOW_IDX_TYPE_SWITCH(indptr_r.type_flag_, IType, {  // indptr type
+      MSHADOW_IDX_TYPE_SWITCH(col_idx_r.type_flag_, CType, {  // col idx type
+        dim_t num_threads;
+        if (req == kWriteTo) {
+          num_threads = data_out.Size();
+          mxnet_op::Kernel<set_zero, cpu>::Launch(
+              s, num_threads, data_out.dptr<DType>());
+        }
+        num_threads = mxnet_op::get_num_threads<cpu>(data_out.shape_[0]);
+        // seg by output row
+        dim_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads;
+        if (transpose_b) {
+          mxnet_op::Kernel<DotDnsCsrTransDnsByRowBlocks, cpu>::Launch(s, num_threads,
+              data_out.dptr<DType>(), data_l.dptr<DType>(),
+              data_r.dptr<DType>(), indptr_r.dptr<IType>(),
+              col_idx_r.dptr<CType>(), seg_len,
+              dns.shape_[0], dns.shape_[1],
+              rhs.shape()[0], rhs.shape()[1]);
+        } else {
+          mxnet_op::Kernel<DotDnsCsrDnsByRowBlocks, cpu>::Launch(s, num_threads,
+              data_out.dptr<DType>(), data_l.dptr<DType>(),
+              data_r.dptr<DType>(), indptr_r.dptr<IType>(),
+              col_idx_r.dptr<CType>(), seg_len,
+              dns.shape_[0], dns.shape_[1],
+              rhs.shape()[0], rhs.shape()[1]);
+        }
+      });
+    });
+  });
 }
 
 inline bool DotShape(const nnvm::NodeAttrs& attrs,

From 8f5330934b585ae0b475e86b10f1c5b69ebbbb2c Mon Sep 17 00:00:00 2001
From: XiaotaoChen
Date: Sat, 2 Jun 2018 15:58:22 +0800
Subject: [PATCH 2/3] complete documentation related to dot(dns, csr/csr.T)=dns on cpu

---
 src/operator/tensor/dot-inl.h | 24 ++++++++++++++++++++++--
 src/operator/tensor/dot.cc    |  4 ++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h
index 55f7bc9bb1e2..0d3066f92f6c 100644
--- a/src/operator/tensor/dot-inl.h
+++ b/src/operator/tensor/dot-inl.h
@@ -666,7 +666,17 @@ struct DotDnsCsrCsrByRowBlocks {
 struct DotDnsCsrDnsByRowBlocks {
   /*!
    * \brief
-   * \param i the i-th thread
+   * \param i the i-th thread
+   * \param out output matrix
+   * \param data_l data of lhs
+   * \param data_r values of csr
+   * \param indptr_r row offsets of csr
+   * \param col_idx_r column indices of csr
+   * \param seg_len workload of this thread
+   * \param num_rows_l number of rows in lhs
+   * \param num_cols_l number of columns in lhs
+   * \param num_rows_r number of rows in rhs
+   * \param num_cols_r number of columns in rhs
    */
   template<typename DType, typename IType, typename CType>
   MSHADOW_CINLINE static void Map(int i,
@@ -704,7 +714,17 @@ struct DotDnsCsrDnsByRowBlocks {
 struct DotDnsCsrTransDnsByRowBlocks {
   /*!
    * \brief
-   * \param i the i-th thread
+   * \param i the i-th thread
+   * \param out output matrix
+   * \param data_l data of lhs
+   * \param data_r values of csr
+   * \param indptr_r row offsets of csr
+   * \param col_idx_r column indices of csr
+   * \param seg_len workload of this thread
+   * \param num_rows_l number of rows in lhs
+   * \param num_cols_l number of columns in lhs
+   * \param num_rows_r number of rows in rhs
+   * \param num_cols_r number of columns in rhs
    */
   template<typename DType, typename IType, typename CType>
   MSHADOW_CINLINE static void Map(int i,
diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc
index 2f44f536397e..556fd1fea56d 100644
--- a/src/operator/tensor/dot.cc
+++ b/src/operator/tensor/dot.cc
@@ -60,8 +60,8 @@ forward_stype option for output storage type. Implemented sparse operations incl
 - dot(csr, default) = default
 - dot(csr, row_sparse) = default
 - dot(default, csr) = csr (CPU only)
-- dot(default, csr, forward_stype='default') = default (GPU only)
-- dot(default, csr, transpose_b=True, forward_stype='default') = default (GPU only)
+- dot(default, csr, forward_stype='default') = default
+- dot(default, csr, transpose_b=True, forward_stype='default') = default
 
 If the combination of input storage types and forward_stype does not match any of the
 above patterns, ``dot`` will fallback and generate output with default storage.
From 84cc2308adadcac79d77f6f4bb7fc154167e6def Mon Sep 17 00:00:00 2001
From: XiaotaoChen
Date: Sat, 9 Jun 2018 09:43:43 +0800
Subject: [PATCH 3/3] support fp16 by replacing MSHADOW_SGL_DBL_TYPE_SWITCH
 with MSHADOW_REAL_TYPE_SWITCH

---
 src/operator/tensor/dot-inl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h
index 0d3066f92f6c..675cbe8b2382 100644
--- a/src/operator/tensor/dot-inl.h
+++ b/src/operator/tensor/dot-inl.h
@@ -1151,11 +1151,11 @@ inline void DotDnsCsrDnsImpl(const OpContext& ctx, const cpu& cpu_dev,
   const TBlob& data_l = dns;
   const TBlob data_out = ret->data();
 
-  MSHADOW_SGL_DBL_TYPE_SWITCH(data_r.type_flag_, DType, {  // data type
+  MSHADOW_REAL_TYPE_SWITCH(data_r.type_flag_, DType, {  // data type
     MSHADOW_IDX_TYPE_SWITCH(indptr_r.type_flag_, IType, {  // indptr type
       MSHADOW_IDX_TYPE_SWITCH(col_idx_r.type_flag_, CType, {  // col idx type
         dim_t num_threads;
-        if (req == kWriteTo) {
+        if (req == kWriteTo || req == kWriteInplace) {
           num_threads = data_out.Size();
           mxnet_op::Kernel<set_zero, cpu>::Launch(
               s, num_threads, data_out.dptr<DType>());