This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
26 commits (changes shown from all commits):
3bf48b8  CPU optimization for ActivationOp (Oct 16, 2017)
40b13f1  lint (Oct 16, 2017)
6d4a2bb  Trigger build (cjolivier01, Oct 17, 2017)
df49eae  Merge remote-tracking branch 'apache/master' into activation_opt_pr (cjolivier01, Oct 17, 2017)
5c92218  Merge remote-tracking branch 'apache/master' into activation_opt_pr (cjolivier01, Oct 17, 2017)
db2767d  Trigger build (cjolivier01, Oct 18, 2017)
bf58bee  Negative begin and end support for csr slice (#8241) (ZiyueHuang, Oct 17, 2017)
4ecb763  Preparing for 0.12.0.rc0: Final changes before RC (#8301) (mbaijal, Oct 17, 2017)
618c2cc  Enable smoothing in softmax operator (#8125) (KellenSunderland, Oct 17, 2017)
cc93069  v0.12 regression: Fix registration of children for Block (#8277) (leezu, Oct 17, 2017)
8730f7a  Revert "[CMAKE] Fix windows cmake build" (#8311) (cjolivier01, Oct 17, 2017)
252227e  fixed broken links. https was pointing to http for mxnet.io (#8300) (thinksanky, Oct 17, 2017)
310bbeb  Update rnn.md (#8320) (szha, Oct 18, 2017)
83e96a9  fluent methods for missed ops (#8329) (szha, Oct 18, 2017)
dc4c3c8  update ps lite (#8327) (piiswrong, Oct 18, 2017)
28b76e3  Fix unused type warning (#8316) (cjolivier01, Oct 18, 2017)
55068f7  Trigger build (Oct 20, 2017)
4065639  Trigger build (cjolivier01, Oct 21, 2017)
2cf83cb  Misc fixes for sparse distributed training (#8345) (eric-haibin-lin, Oct 21, 2017)
f4c57aa  Fix the Readme (#8369) (mbaijal, Oct 21, 2017)
68ea95f  Allow test to converge (#8351) (cjolivier01, Oct 21, 2017)
2bb9e94  Update cudnn_algoreg-inl.h (#7988) (solin319, Oct 21, 2017)
52adc56  [Perl] emulate Python zip() for Perl (#8192) (tlby, Oct 21, 2017)
fa80a31  add profile option for frontend profiling to image script (#8171) (szha, Oct 21, 2017)
9795461  Fix Typo (classification) (#8376) (0x6a62, Oct 21, 2017)
d60707c  Merge remote-tracking branch 'apache/master' into activation_opt_pr (cjolivier01, Oct 22, 2017)
35 changes: 27 additions & 8 deletions src/operator/activation-inl.h
@@ -22,6 +22,7 @@
* \brief Activation operator
* \author Bing Xu
*/

#ifndef MXNET_OPERATOR_ACTIVATION_INL_H_
#define MXNET_OPERATOR_ACTIVATION_INL_H_

@@ -34,6 +35,7 @@
#include <vector>
#include <utility>
#include "./operator_common.h"
#include "./mxnet_op.h"

namespace mxnet {
namespace op {
@@ -75,9 +77,16 @@ class ActivationOp : public Operator {
CHECK_EQ(in_data.size(), 1U);
CHECK_EQ(out_data.size(), 1U);
Stream<xpu> *s = ctx.get_stream<xpu>();
Tensor<xpu, 2, DType> data = in_data[activation::kData].FlatTo2D<xpu, DType>(s);
Tensor<xpu, 2, DType> out = out_data[activation::kOut].FlatTo2D<xpu, DType>(s);
Assign(out, req[activation::kOut], F<ForwardOp>(data));
const TBlob& input = in_data[activation::kData];
const size_t sz = input.shape_.Size();
if (sz) {
MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<ForwardOp, Req>, xpu>::Launch(
s, sz,
out_data[activation::kOut].dptr<DType>(),
input.dptr<DType>());
});
}
}

virtual void Backward(const OpContext &ctx,
@@ -93,14 +102,24 @@
CHECK(in_data.size() == 1 && in_grad.size() == 1);
CHECK_EQ(req.size(), 1U);
Stream<xpu> *s = ctx.get_stream<xpu>();
Tensor<xpu, 2, DType> m_out_grad = out_grad[activation::kOut].FlatTo2D<xpu, DType>(s);
Tensor<xpu, 2, DType> m_out_data = out_data[activation::kOut].FlatTo2D<xpu, DType>(s);
Tensor<xpu, 2, DType> m_in_grad = in_grad[activation::kData].FlatTo2D<xpu, DType>(s);
Assign(m_in_grad, req[activation::kData], F<BackwardOp>(m_out_data) * m_out_grad);
const TBlob& m_out_grad = out_grad[activation::kOut];
const TBlob& m_out_data = out_data[activation::kOut];
const TBlob& m_in_grad = in_grad[activation::kData];
const size_t sz = m_out_data.shape_.Size();
if (sz) {
MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<
mxnet::op::mxnet_op::backward_grad<BackwardOp>, Req>, xpu>::Launch(
s, sz,
m_in_grad.dptr<DType>(),
m_out_grad.dptr<DType>(),
m_out_data.dptr<DType>());
});
}
}
}; // class ActivationOp

// Decalre Factory function, used for dispatch specialization
// Declare Factory function, used for dispatch specialization
template<typename xpu>
Operator* CreateOp(ActivationParam type, int dtype, const TShape& dshape);

14 changes: 14 additions & 0 deletions src/operator/mxnet_op.h
@@ -215,6 +215,20 @@ struct set_zero {
}
};

/*! \brief Binary op backward gradient OP wrapper */
template<typename GRAD_OP>
struct backward_grad {
/* \brief Backward calc with grad
* \param a - output grad
* \param args... - data to grad calculation op (what this is -- input, output, etc. -- varies)
* \return input grad
*/
template<typename DType, typename ...Args>
MSHADOW_XINLINE static DType Map(DType a, Args... args) {
return DType(a * GRAD_OP::Map(args...));
}
};
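
For reference, this wrapper is what the new ActivationOp::Backward launches: the kernel passes the output gradient as the first argument and the saved output as the remaining argument, so backward_grad<GRAD_OP> yields in_grad = out_grad * GRAD_OP(out_data) element-wise. A minimal standalone sketch follows; sigmoid_grad is only an example op, and the MSHADOW_XINLINE stand-in is a plain inline.

#include <iostream>

#define MSHADOW_XINLINE inline  // stand-in for the real mshadow macro

// Example element-wise gradient op: for sigmoid, d(out)/d(in) = out * (1 - out).
struct sigmoid_grad {
  template <typename DType>
  MSHADOW_XINLINE static DType Map(DType out) { return out * (DType(1) - out); }
};

// Copy of the wrapper added above, repeated here so the sketch stands alone.
template <typename GRAD_OP>
struct backward_grad {
  template <typename DType, typename... Args>
  MSHADOW_XINLINE static DType Map(DType a, Args... args) {
    return DType(a * GRAD_OP::Map(args...));
  }
};

int main() {
  double out_grad = 0.5, out_data = 0.8;
  // in_grad = out_grad * sigmoid_grad(out_data) = 0.5 * 0.8 * 0.2 = 0.08
  std::cout << backward_grad<sigmoid_grad>::Map(out_grad, out_data) << '\n';
}
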

/*! \brief Select assignment operation based upon the req value
* Also useful for mapping mshadow Compute (F<OP>) to Kernel<OP>::Launch
*/
48 changes: 38 additions & 10 deletions tests/cpp/include/test_op.h
@@ -100,7 +100,8 @@ class BasicOperatorData {
#endif
, initializeForward_(0) // unit testing may call inits in any order based
, initializeBackward_(0) // upon its use-case (ie may not want to run forward pass first)
, initializeCallback_(0) {
, initializeCallback_(0)
, generator_(new std::mt19937()) {
opContext_.is_train = true;
opContext_.run_ctx.stream = nullptr;

@@ -123,10 +124,14 @@
shape_input_vec_.resize(opProp.ListArguments().size());
op_.reset(opProp.CreateOperatorEx(getContext(), &shape_input_vec_, in_type));
if (op_) {
const size_t output_count = opProp.ListOutputs().size();
const size_t aux_count = opProp.ListAuxiliaryStates().size();
// Figure out what sort of blobs we need to allocate
std::vector<TShape> out_shape, aux_shape;
out_shape.resize(output_count);
aux_shape.resize(aux_count);
opProp.InferShape(&shape_input_vec_, &out_shape, &aux_shape);
std::vector<int> out_type, aux_type;
std::vector<int> out_type(output_count, -1), aux_type(aux_count, -1);
opProp.InferType(in_type, &out_type, &aux_type);

// Allocate top blobs (input)
@@ -174,9 +179,9 @@ class BasicOperatorData {
initForward(opProp, in_type);
if (!initializeBackward_++) {
for (size_t x = 0, n = static_cast<size_t>(opProp.NumVisibleOutputs()); x < n; ++x) {
CHECK_LT(x, c_.blob_input_vec_.size());
allocateBlob(&c_.blob_out_grad_, c_.blob_input_vec_[x].shape_,
false, c_.blob_input_vec_[x].type_flag_);
CHECK_LT(x, c_.blob_output_vec_.size());
allocateBlob(&c_.blob_out_grad_, c_.blob_output_vec_[x].shape_,
false, c_.blob_output_vec_[x].type_flag_);
}

for (size_t x = 0, n = c_.blob_input_vec_.size(); x < n; ++x) {
@@ -197,6 +202,7 @@

/*! \brief Run operator forward */
void forward(const size_t count = 1) {
const std::vector<OpReqType> req(c_.blob_output_vec_.size(), kWriteTo);
// Possibly move data to/from CPU and GPU (outside of timing scope)
MXNET_CUDA_ONLY(std::unique_ptr<GPUOpData> gpuData(isGPU_ ?
new GPUOpData(c_, &opContext_) : nullptr));
@@ -206,15 +212,15 @@
for (size_t x = 0; x < count; ++x) {
op()->Forward(opContext_,
c_.blob_input_vec_,
{kWriteTo, kWriteTo, kWriteTo},
req,
c_.blob_output_vec_,
c_.blob_aux_states_);
}
} else {
for (size_t x = 0; x < count; ++x) {
MXNET_CUDA_ONLY(op()->Forward(opContext_,
gpuData->blob_input_vec_,
{kWriteTo, kWriteTo, kWriteTo},
req,
gpuData->blob_output_vec_,
gpuData->blob_aux_states_));
}
@@ -223,6 +229,7 @@

/*! \brief Run operator backwards */
void backward(const size_t count = 1) {
const std::vector<OpReqType> req(c_.blob_output_vec_.size(), kWriteTo);
// Possibly move data to/from CPU and GPU (outside of timing scope)
MXNET_CUDA_ONLY(std::unique_ptr<GPUOpData> gpuData(isGPU_ ?
new GPUOpData(c_, &opContext_) : nullptr));
@@ -234,7 +241,7 @@
c_.blob_out_grad_,
c_.blob_input_vec_,
c_.blob_output_vec_,
{kWriteTo, kWriteTo, kWriteTo},
req,
c_.blob_in_grad_,
c_.blob_aux_states_);
}
@@ -244,7 +251,7 @@
gpuData->blob_out_grad_,
gpuData->blob_input_vec_,
gpuData->blob_output_vec_,
{kWriteTo, kWriteTo, kWriteTo},
req,
gpuData->blob_in_grad_,
gpuData->blob_aux_states_));
}
@@ -386,6 +393,21 @@ class BasicOperatorData {
copy(blob, sourceData, 0, sourceDataSize);
}

void FillRandom() {
std::uniform_real_distribution<DType> distribution(-1.0, 1.0);
for (size_t j = 0, jn = this->c_.all_blob_vects_.size(); j < jn; ++j) {
std::vector<TBlob> *data_vect = this->c_.all_blob_vects_[j];
if (data_vect) {
for (size_t i = 0, n = data_vect->size(); i < n; ++i) {
TBlob &blob = (*data_vect)[i];
test::patternFill<DType>(&blob, [this, &distribution]() -> DType {
return distribution(generator());
});
}
}
}
}
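
A self-contained sketch of the fill pattern FillRandom relies on, under the assumption that test::patternFill simply invokes the supplied callback once per element (a plain loop stands in for it below); the per-test mt19937 corresponds to the generator_ member added in this change.

#include <random>
#include <vector>
#include <functional>
#include <iostream>

// Stand-in for test::patternFill: fill a buffer from a value-producing callback.
template <typename DType>
void patternFill(std::vector<DType>* blob, const std::function<DType()>& gen) {
  for (DType& v : *blob) v = gen();
}

int main() {
  std::mt19937 generator;                                    // per-test generator, as in generator_
  std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
  std::vector<float> blob(8);
  patternFill<float>(&blob, [&]() -> float { return distribution(generator); });
  for (float v : blob) std::cout << v << ' ';
  std::cout << '\n';
}
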

/*! \brief Input and output blobs */
OpContext opContext_;

@@ -520,6 +542,9 @@ class BasicOperatorData {
return allocateBlob(&standalone_blobs_, dest, shape, isGPU, dtype);
}

/*! \brief mt19937 generator for random number generator */
std::mt19937& generator() { return *generator_; }

/*! \brief Performance timing categories */
enum TimingId {
Forward,
@@ -539,6 +564,9 @@
/*! \brief scoped lifecycle management of allocated blobs */
std::list<std::unique_ptr<test::StandaloneBlob>> standalone_blobs_;

/*! \brief Per-test generator */
std::unique_ptr<std::mt19937> generator_;

public:
/*! Timing instrumentation */
test::perf::TimingInstrument timing_;
@@ -675,7 +703,7 @@ class Validator {
}
const TBlob& b1 = bv1[idx];
const TBlob& b2 = bv2[idx];
if (print && test::debugOutput) {
if (print && test::debug_output) {
test::print(RunContext(), &(std::cout << "Blob 1:"), b1, true, true);
test::print(RunContext(), &(std::cout << "Blob 2:"), b2, true, true);
}