From b8ac9c948a943d2ec8832539aee8fdee7026a80e Mon Sep 17 00:00:00 2001
From: ZiyueHuang
Date: Tue, 20 Feb 2018 02:29:36 +0000
Subject: [PATCH 01/18] refactor embed backward kernel launcher

---
 src/operator/tensor/indexing_op-inl.cuh | 134 +++++++++++++-----------
 1 file changed, 75 insertions(+), 59 deletions(-)

diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh
index 4458151f1782..81ad108a0747 100644
--- a/src/operator/tensor/indexing_op-inl.cuh
+++ b/src/operator/tensor/indexing_op-inl.cuh
@@ -38,7 +38,7 @@ namespace mxnet {
 namespace op {
 const int kWarpSize = 32;
-template<int SZ, typename DType, typename IdxType>
+template<int SZ, bool lookup, typename DType, typename IdxType>
 __global__ void AddTakeGradLargeBatchKernel(DType* dst,
                                             // If idx_start == NULL, then in-kernel edge
                                             // detection is used
@@ -47,7 +47,9 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst,
                                             const int* idx_start_size_ptr,
                                             const IdxType *sorted,
                                             const IdxType *index,
                                             const DType *src,
-                                            int ymax, int xmax) {
+                                            int ymax, int xmax,
+                                            // table to look up positions of row_ids in dst
+                                            const nnvm::dim_t *lookup_table) {
   // Size of the shared memory is [blockDim.x*SZ*blockDim.y]*sizeof(DType)
   extern __shared__ char sh_grad_weight_char[];
   DType* sh_grad_weight = (DType*)sh_grad_weight_char;
@@ -125,7 +127,7 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst,
   }
   const int start_feature = threadIdx.x + blockIdx.x * blockDim.x * SZ;
-  const int dst_row = sorted_value * xmax;
+  const int dst_row = (lookup ? lookup_table[sorted_value] : sorted_value) * xmax;
   int num_idx = idx_end - idx_begin;
   int idx0 = idx_begin + threadIdx.y*num_idx/blockDim.y;
@@ -199,6 +201,73 @@ AddTakeGradLargeBatchWorkspaceSize(size_t num_keys) {
   return (unique_bytes + counts_bytes + num_runs_bytes + temporary_bytes);
 }
+template<bool lookup, typename IndexType, typename DType>
+inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor<gpu, 2, DType> dst,
+                                              const mshadow::Tensor<gpu, 1, IndexType>& sorted,
+                                              const mshadow::Tensor<gpu, 1, IndexType>& index,
+                                              const mshadow::Tensor<gpu, 2, DType> &src,
+                                              IndexType* sum_counts_ptr,
+                                              int* num_runs_ptr,
+                                              const nnvm::dim_t* lookup_table) {
+  cudaStream_t stream = mshadow::Stream<gpu>::GetStream(dst.stream_);
+  const int num_unique_est = min(dst.size(0), src.size(0));
+  const int max_nthread = 128;
+  const int num_y = max(src.size(0)/num_unique_est, 1);
+  const int block_dim_x = kWarpSize;
+  const int block_dim_y = min(num_y, max_nthread/block_dim_x);
+  const int SZ = min((src.size(1) + block_dim_x - 1) / block_dim_x, 4);
+  const int grid_dim_x = (src.size(1) + block_dim_x * SZ - 1) / (block_dim_x * SZ);
+  const int grid_dim_y = min(num_unique_est, mshadow::cuda::kBaseGridNum);
+  dim3 dimBlock(block_dim_x, block_dim_y);
+  dim3 dimGrid(grid_dim_x, grid_dim_y);
+  // Maximum shared memory usage: 128*4*sizeof(DType), which is 4K for 64bit DType elements
+  int shmem_size = dimBlock.x*SZ*dimBlock.y*sizeof(DType);
+
+  CHECK_EQ(dst.size(1), src.size(1)) << "AddTakeGradLargeBatch: shape mismatch";
+  CHECK_EQ(index.size(0), src.size(0)) << "AddTakeGradLargeBatch: shape mismatch";
+  mshadow::cuda::CheckLaunchParam(dimGrid, dimBlock, "AddTakeGradLargeBatch");
+
+  switch (SZ) {
+  case 1:
+    AddTakeGradLargeBatchKernel<1, false, DType>
+        <<<dimGrid, dimBlock, shmem_size, stream>>>
+        (dst.dptr_, sum_counts_ptr, num_runs_ptr,
+         sorted.dptr_, index.dptr_, src.dptr_,
+         static_cast<int>(src.size(0)),
+         static_cast<int>(src.size(1)), lookup_table);
+    break;
+  case 2:
+    AddTakeGradLargeBatchKernel<2, false, DType>
+        <<<dimGrid, dimBlock, shmem_size, stream>>>
+        (dst.dptr_, sum_counts_ptr, num_runs_ptr,
+         sorted.dptr_, index.dptr_, src.dptr_,
+         static_cast<int>(src.size(0)),
+         static_cast<int>(src.size(1)), lookup_table);
+    break;
+  case 3:
+    AddTakeGradLargeBatchKernel<3, false, 
DType> + <<>> + (dst.dptr_, sum_counts_ptr, num_runs_ptr, + sorted.dptr_, index.dptr_, src.dptr_, + static_cast(src.size(0)), + static_cast(src.size(1)), lookup_table); + break; + case 4: + AddTakeGradLargeBatchKernel<4, false, DType> + <<>> + (dst.dptr_, sum_counts_ptr, num_runs_ptr, + sorted.dptr_, index.dptr_, src.dptr_, + static_cast(src.size(0)), + static_cast(src.size(1)), lookup_table); + break; + default: + LOG(FATAL) << "AddTakeGradLargeBatch, incorrect value SZ " << SZ; + break; + } + MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradLargeBatchKernel); +} + + template inline void AddTakeGradLargeBatch(mshadow::Tensor dst, const mshadow::Tensor& sorted, @@ -249,62 +318,9 @@ inline void AddTakeGradLargeBatch(mshadow::Tensor dst, (temporary_storage, temporary_bytes, counts_out_ptr, sum_counts_ptr, sorted.size(0), stream); } - - const int num_unique_est = min(dst.size(0), src.size(0)); - const int max_nthread = 128; - const int num_y = max(src.size(0)/num_unique_est, 1); - const int block_dim_x = kWarpSize; - const int block_dim_y = min(num_y, max_nthread/block_dim_x); - const int SZ = min((src.size(1) + block_dim_x - 1) / block_dim_x, 4); - const int grid_dim_x = (src.size(1) + block_dim_x * SZ - 1) / (block_dim_x * SZ); - const int grid_dim_y = min(num_unique_est, mshadow::cuda::kBaseGridNum); - dim3 dimBlock(block_dim_x, block_dim_y); - dim3 dimGrid(grid_dim_x, grid_dim_y); - // Maximum shared memory usage: 128*4*sizeof(DType), which is 4K for 64bit DType elements - int shmem_size = dimBlock.x*SZ*dimBlock.y*sizeof(DType); - - CHECK_EQ(dst.size(1), src.size(1)) << "AddTakeGradLargeBatch: shape mismatch"; - CHECK_EQ(index.size(0), src.size(0)) << "AddTakeGradLargeBatch: shape mismatch"; - mshadow::cuda::CheckLaunchParam(dimGrid, dimBlock, "AddTakeGradLargeBatch"); - - switch (SZ) { - case 1: - AddTakeGradLargeBatchKernel<1, DType> - <<>> - (dst.dptr_, sum_counts_ptr, num_runs_ptr, - sorted.dptr_, index.dptr_, src.dptr_, - static_cast(src.size(0)), - static_cast(src.size(1))); - break; - case 2: - AddTakeGradLargeBatchKernel<2, DType> - <<>> - (dst.dptr_, sum_counts_ptr, num_runs_ptr, - sorted.dptr_, index.dptr_, src.dptr_, - static_cast(src.size(0)), - static_cast(src.size(1))); - break; - case 3: - AddTakeGradLargeBatchKernel<3, DType> - <<>> - (dst.dptr_, sum_counts_ptr, num_runs_ptr, - sorted.dptr_, index.dptr_, src.dptr_, - static_cast(src.size(0)), - static_cast(src.size(1))); - break; - case 4: - AddTakeGradLargeBatchKernel<4, DType> - <<>> - (dst.dptr_, sum_counts_ptr, num_runs_ptr, - sorted.dptr_, index.dptr_, src.dptr_, - static_cast(src.size(0)), - static_cast(src.size(1))); - break; - default: - LOG(FATAL) << "AddTakeGradLargeBatch, incorrect value SZ " << SZ; - break; - } - MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradLargeBatchKernel); + nnvm::dim_t* lookup_table = nullptr; + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + num_runs_ptr, lookup_table); } } // namespace op From 131bf3243cd13ebad1dc73d8e3d10ab6d6c97567 Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Tue, 20 Feb 2018 15:21:07 +0000 Subject: [PATCH 02/18] pass unit test --- src/operator/tensor/indexing_op-inl.cuh | 3 +- src/operator/tensor/indexing_op.cu | 220 ++++++++++++++++++ tests/python/unittest/test_sparse_operator.py | 4 +- 3 files changed, 224 insertions(+), 3 deletions(-) diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh index 81ad108a0747..ed4589594444 100644 --- a/src/operator/tensor/indexing_op-inl.cuh +++ 
b/src/operator/tensor/indexing_op-inl.cuh @@ -127,7 +127,8 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst, } const int start_feature = threadIdx.x + blockIdx.x * blockDim.x * SZ; - const int dst_row = (lookup ? lookup_table[sorted_value] : sorted_value) * xmax; + // TODO remove -1 + const int dst_row = (lookup ? (lookup_table[sorted_value]-1) : sorted_value) * xmax; int num_idx = idx_end - idx_begin; int idx0 = idx_begin + threadIdx.y*num_idx/blockDim.y; diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 762d8fd64c2b..c00b22f514fd 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -26,6 +26,7 @@ #include "./indexing_op.h" #include "./util/tensor_util-inl.cuh" +#include namespace mxnet { namespace op { @@ -103,6 +104,216 @@ void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, } } +inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, + const TBlob& ograd, + const TBlob& data, + const OpReqType req, + const NDArray& output) { + using namespace mshadow; + using namespace mxnet_op; + using namespace mshadow::expr; + using namespace rowsparse; + using nnvm::dim_t; + if (req == kNullOp) return; + CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " + << "weight gradient calculation with req != write"; + + // Request temporary storage for marking non-zero rows and prefix sum + Stream *s = ctx.get_stream(); + dim_t num_rows = output.shape()[0]; + dim_t row_length = output.shape()[1]; + dim_t data_size = static_cast(data.shape_.Size()); + dim_t num_threads; + if (data_size == 0) { + FillZerosRspImpl(s, output); + return; + } + s->Wait(); + auto t0 = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count(); + + MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { + MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { + dim_t* prefix_sum = NULL; + void* temp_storage = NULL; + dim_t* sorted_data = NULL; + dim_t* original_idx = NULL; + // calculate resource bytes + size_t row_flg_storage_bytes = num_rows * sizeof(dim_t); + size_t sorted_data_storage_bytes = data_size * sizeof(dim_t); + size_t original_idx_storage_bytes = data_size * sizeof(dim_t); + size_t sum_workspace_bytes = 0; + size_t sort_workspace_size = SortByKeyWorkspaceSize(data_size); + cub::DeviceScan::InclusiveSum(temp_storage, + sum_workspace_bytes, + prefix_sum, + prefix_sum, + num_rows, + Stream::GetStream(s)); + // temp_workspace is shared by inclusive sum and sort + size_t temp_workspace_bytes = std::max(sum_workspace_bytes, sort_workspace_size); + size_t total_storage_bytes = row_flg_storage_bytes + sorted_data_storage_bytes + + original_idx_storage_bytes + temp_workspace_bytes; + + // request resource and split it. 
layout = + // row_flg/prefixsum, sorted_data, original_idx, temp_storage + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(total_storage_bytes), s); + prefix_sum = reinterpret_cast(workspace.dptr_); + sorted_data = reinterpret_cast(workspace.dptr_ + row_flg_storage_bytes); + original_idx = reinterpret_cast(workspace.dptr_ + row_flg_storage_bytes + + sorted_data_storage_bytes); + temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; + // compute row flags and prefix sum + num_threads = num_rows; + s->Wait(); + auto t1 = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count(); + Fill(s, TBlob(prefix_sum, Shape1(num_threads), gpu::kDevMask), kWriteTo, 0); + Kernel::Launch(s, data_size, prefix_sum, data.dptr()); + cub::DeviceScan::InclusiveSum(temp_storage, + temp_workspace_bytes, + prefix_sum, + prefix_sum, + num_rows, + mshadow::Stream::GetStream(s)); + // retrieve nnr and allocate output + s->Wait(); + auto t2 = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count(); + dim_t nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t), + cudaMemcpyDeviceToHost)); + output.CheckAndAlloc({Shape1(nnr)}); + // fill row_idx array of output matrix, using the row_flg values + RType* grad_row_idx = output.aux_data(kIdx).dptr(); + Kernel::Launch(s, num_rows, + grad_row_idx, prefix_sum, num_rows); + + // make a copy of the data, to be sorted + TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); + auto sorted_data_tensor = sorted_data_blob.FlatTo1D(s); + mxnet_op::copy(s, sorted_data_blob, data); + + // generate original idx + Tensor original_idx_tensor(original_idx, Shape1(data_size), s); + Kernel::Launch(s, data_size, 1, static_cast(0), static_cast(1), + kWriteTo, original_idx); + // sort data with its original idx + int num_bits = ilog2(num_rows - 1); + char* temp_storage_ptr = reinterpret_cast(temp_storage); + Tensor temp_storage_tensor(temp_storage_ptr, + Shape1(sort_workspace_size), s); + s->Wait(); + auto t3 = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count(); + SortByKey(sorted_data_tensor, original_idx_tensor, true, + &temp_storage_tensor, 0, num_bits); + s->Wait(); + auto t4 = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count(); + + // accumulate gradients + DType* grad_data = output.data().dptr(); + Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), + kWriteTo, 0); + //const int SZ = 4; + //int num_threads_per_data = (row_length + SZ - 1) / SZ; + + //Kernel::Launch(s, data_size * num_threads_per_data, + // grad_data, prefix_sum, sorted_data, data_size, original_idx, + // ograd.dptr(), row_length, num_threads_per_data, SZ); + +{ + //using IndexType = IType; + dim_t* sum_counts_ptr = NULL; + int* num_runs_ptr = NULL; + mshadow::Tensor dst = output.data().get(s); + cudaStream_t stream = mshadow::Stream::GetStream(dst.stream_); + mshadow::Tensor sorted = sorted_data_tensor; + mshadow::Tensor index = original_idx_tensor; + auto arrshape = ograd.shape_; + mshadow::Tensor src = ograd.get_with_shape( + Shape2(arrshape[0], arrshape.ProdShape(1, arrshape.ndim())), s); + + const int num_unique_est = min(dst.size(0), src.size(0)); + const int max_nthread = 128; + const int num_y = max(src.size(0)/num_unique_est, 1); + const int block_dim_x = kWarpSize; + const int block_dim_y = min(num_y, max_nthread/block_dim_x); + const int SZ = 
min((src.size(1) + block_dim_x - 1) / block_dim_x, 4); + const int grid_dim_x = (src.size(1) + block_dim_x * SZ - 1) / (block_dim_x * SZ); + const int grid_dim_y = min(num_unique_est, mshadow::cuda::kBaseGridNum); + dim3 dimBlock(block_dim_x, block_dim_y); + dim3 dimGrid(grid_dim_x, grid_dim_y); + // Maximum shared memory usage: 128*4*sizeof(DType), which is 4K for 64bit DType elements + int shmem_size = dimBlock.x*SZ*dimBlock.y*sizeof(DType); + + CHECK_EQ(dst.size(1), src.size(1)) << "AddTakeGradLargeBatch: shape mismatch"; + CHECK_EQ(index.size(0), src.size(0)) << "AddTakeGradLargeBatch: shape mismatch"; + mshadow::cuda::CheckLaunchParam(dimGrid, dimBlock, "AddTakeGradLargeBatch"); + nnvm::dim_t* lookup_table = prefix_sum; + + switch (SZ) { + case 1: + AddTakeGradLargeBatchKernel<1, true, DType> + <<>> + (dst.dptr_, sum_counts_ptr, num_runs_ptr, + sorted.dptr_, index.dptr_, src.dptr_, + static_cast(src.size(0)), + static_cast(src.size(1)), lookup_table); + break; + case 2: + AddTakeGradLargeBatchKernel<2, true, DType> + <<>> + (dst.dptr_, sum_counts_ptr, num_runs_ptr, + sorted.dptr_, index.dptr_, src.dptr_, + static_cast(src.size(0)), + static_cast(src.size(1)), lookup_table); + break; + case 3: + AddTakeGradLargeBatchKernel<3, true, DType> + <<>> + (dst.dptr_, sum_counts_ptr, num_runs_ptr, + sorted.dptr_, index.dptr_, src.dptr_, + static_cast(src.size(0)), + static_cast(src.size(1)), lookup_table); + break; + case 4: + AddTakeGradLargeBatchKernel<4, true, DType> + <<>> + (dst.dptr_, sum_counts_ptr, num_runs_ptr, + sorted.dptr_, index.dptr_, src.dptr_, + static_cast(src.size(0)), + static_cast(src.size(1)), lookup_table); + break; + default: + LOG(FATAL) << "AddTakeGradLargeBatch, incorrect value SZ " << SZ; + break; + } + MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradLargeBatchKernel); + + +} + + + s->Wait(); + auto t5 = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()).count(); +bool log = dmlc::GetEnv("LOG_TIME", true); +if (log) { +LOG(INFO) << t1-t0 << "\t" + << t2-t1 << "\t" + << t3-t2 << "\t" + << t4-t3 << "\t" + << t5-t4 << "\t" + << t5-t0 << "\t"; +} + }); + }); + }); +} template<> inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, @@ -110,6 +321,15 @@ inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, const TBlob& data, const OpReqType req, const NDArray& output) { + + int deterministic = dmlc::GetEnv("MXNET_DETERM", 0); + if (deterministic == 1) { + LOG(FATAL) << "NOT REAHCED"; + return; + } else if (deterministic == 2) { + SparseEmbeddingOpBackwardRspImpl3(ctx, ograd, data, req, output); + return; + } using namespace mshadow; using namespace mxnet_op; using namespace mshadow::expr; diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index 84dfc5878c20..b9447626d081 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -1615,7 +1615,7 @@ def check_sparse_elementwise_sum_with_shape(stype, shape, n): def test_sparse_embedding(): - ''' test sparse embedding op on cpu ''' + ''' test sparse embedding operator ''' def check_sparse_embedding(executor, weight_ref, data_onehot, grad, density): # update weight based on density weight[:] = rand_ndarray(weight.shape, 'row_sparse', density=density) @@ -1646,7 +1646,7 @@ def check_sparse_embedding(executor, weight_ref, data_onehot, grad, density): arg_map["data"][:] = np_data # init grad np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape) - grad = 
mx.nd.sparse.zeros('row_sparse', np_grad.shape) + grad = mx.nd.zeros(np_grad.shape) grad[:] = np_grad # weight weight = arg_map["embed_weight"] From 04f353ee2f1cd054439f2f772a42a7e5ba12d869 Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Wed, 21 Feb 2018 01:54:54 +0000 Subject: [PATCH 03/18] refactor --- src/operator/tensor/indexing_op-inl.cuh | 8 +-- src/operator/tensor/indexing_op.cu | 68 +------------------------ 2 files changed, 6 insertions(+), 70 deletions(-) diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh index ed4589594444..10e665016b6c 100644 --- a/src/operator/tensor/indexing_op-inl.cuh +++ b/src/operator/tensor/indexing_op-inl.cuh @@ -230,7 +230,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst switch (SZ) { case 1: - AddTakeGradLargeBatchKernel<1, false, DType> + AddTakeGradLargeBatchKernel<1, lookup, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -238,7 +238,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 2: - AddTakeGradLargeBatchKernel<2, false, DType> + AddTakeGradLargeBatchKernel<2, lookup, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -246,7 +246,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 3: - AddTakeGradLargeBatchKernel<3, false, DType> + AddTakeGradLargeBatchKernel<3, lookup, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -254,7 +254,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 4: - AddTakeGradLargeBatchKernel<4, false, DType> + AddTakeGradLargeBatchKernel<4, lookup, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index c00b22f514fd..e42418808d7b 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -218,86 +218,22 @@ inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, DType* grad_data = output.data().dptr(); Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), kWriteTo, 0); - //const int SZ = 4; - //int num_threads_per_data = (row_length + SZ - 1) / SZ; - - //Kernel::Launch(s, data_size * num_threads_per_data, - // grad_data, prefix_sum, sorted_data, data_size, original_idx, - // ograd.dptr(), row_length, num_threads_per_data, SZ); { //using IndexType = IType; dim_t* sum_counts_ptr = NULL; int* num_runs_ptr = NULL; mshadow::Tensor dst = output.data().get(s); - cudaStream_t stream = mshadow::Stream::GetStream(dst.stream_); mshadow::Tensor sorted = sorted_data_tensor; mshadow::Tensor index = original_idx_tensor; auto arrshape = ograd.shape_; mshadow::Tensor src = ograd.get_with_shape( Shape2(arrshape[0], arrshape.ProdShape(1, arrshape.ndim())), s); - const int num_unique_est = min(dst.size(0), src.size(0)); - const int max_nthread = 128; - const int num_y = max(src.size(0)/num_unique_est, 1); - const int block_dim_x = kWarpSize; - const int block_dim_y = min(num_y, max_nthread/block_dim_x); - const int SZ = min((src.size(1) + block_dim_x - 1) / block_dim_x, 4); - const int grid_dim_x = (src.size(1) + block_dim_x * SZ - 1) / (block_dim_x * SZ); - const int grid_dim_y = min(num_unique_est, mshadow::cuda::kBaseGridNum); - dim3 
dimBlock(block_dim_x, block_dim_y); - dim3 dimGrid(grid_dim_x, grid_dim_y); - // Maximum shared memory usage: 128*4*sizeof(DType), which is 4K for 64bit DType elements - int shmem_size = dimBlock.x*SZ*dimBlock.y*sizeof(DType); - - CHECK_EQ(dst.size(1), src.size(1)) << "AddTakeGradLargeBatch: shape mismatch"; - CHECK_EQ(index.size(0), src.size(0)) << "AddTakeGradLargeBatch: shape mismatch"; - mshadow::cuda::CheckLaunchParam(dimGrid, dimBlock, "AddTakeGradLargeBatch"); nnvm::dim_t* lookup_table = prefix_sum; - - switch (SZ) { - case 1: - AddTakeGradLargeBatchKernel<1, true, DType> - <<>> - (dst.dptr_, sum_counts_ptr, num_runs_ptr, - sorted.dptr_, index.dptr_, src.dptr_, - static_cast(src.size(0)), - static_cast(src.size(1)), lookup_table); - break; - case 2: - AddTakeGradLargeBatchKernel<2, true, DType> - <<>> - (dst.dptr_, sum_counts_ptr, num_runs_ptr, - sorted.dptr_, index.dptr_, src.dptr_, - static_cast(src.size(0)), - static_cast(src.size(1)), lookup_table); - break; - case 3: - AddTakeGradLargeBatchKernel<3, true, DType> - <<>> - (dst.dptr_, sum_counts_ptr, num_runs_ptr, - sorted.dptr_, index.dptr_, src.dptr_, - static_cast(src.size(0)), - static_cast(src.size(1)), lookup_table); - break; - case 4: - AddTakeGradLargeBatchKernel<4, true, DType> - <<>> - (dst.dptr_, sum_counts_ptr, num_runs_ptr, - sorted.dptr_, index.dptr_, src.dptr_, - static_cast(src.size(0)), - static_cast(src.size(1)), lookup_table); - break; - default: - LOG(FATAL) << "AddTakeGradLargeBatch, incorrect value SZ " << SZ; - break; - } - MSHADOW_CUDA_POST_KERNEL_CHECK(AddTakeGradLargeBatchKernel); - - + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + num_runs_ptr, lookup_table); } - - s->Wait(); auto t5 = std::chrono::duration_cast( std::chrono::high_resolution_clock::now().time_since_epoch()).count(); From d9fc5f62bfae6dbad62572bebb9933c8f64f4219 Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Wed, 21 Feb 2018 02:23:54 +0000 Subject: [PATCH 04/18] fix dim bug --- src/operator/tensor/indexing_op.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index e42418808d7b..f9dd8629104b 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -220,16 +220,14 @@ inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, kWriteTo, 0); { - //using IndexType = IType; dim_t* sum_counts_ptr = NULL; int* num_runs_ptr = NULL; mshadow::Tensor dst = output.data().get(s); mshadow::Tensor sorted = sorted_data_tensor; mshadow::Tensor index = original_idx_tensor; - auto arrshape = ograd.shape_; + const auto oshape = ograd.shape_; mshadow::Tensor src = ograd.get_with_shape( - Shape2(arrshape[0], arrshape.ProdShape(1, arrshape.ndim())), s); - + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); nnvm::dim_t* lookup_table = prefix_sum; AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, num_runs_ptr, lookup_table); From 021d66e238ebcbcb38315f68d50e047276e1925b Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Wed, 21 Feb 2018 08:18:39 +0000 Subject: [PATCH 05/18] add unique impl --- src/operator/tensor/indexing_op-inl.cuh | 17 +-- src/operator/tensor/indexing_op.cu | 161 +++++++++++++++++++----- 2 files changed, 137 insertions(+), 41 deletions(-) diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh index 10e665016b6c..b65de06afa91 100644 --- a/src/operator/tensor/indexing_op-inl.cuh +++ 
b/src/operator/tensor/indexing_op-inl.cuh @@ -38,7 +38,7 @@ namespace mxnet { namespace op { const int kWarpSize = 32; -template +template __global__ void AddTakeGradLargeBatchKernel(DType* dst, // If idx_start == NULL, then in-kernel edge // detection is used @@ -128,7 +128,8 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst, const int start_feature = threadIdx.x + blockIdx.x * blockDim.x * SZ; // TODO remove -1 - const int dst_row = (lookup ? (lookup_table[sorted_value]-1) : sorted_value) * xmax; + const int extra_off = hb_offset ? -1 : 0; + const int dst_row = (lookup ? (lookup_table[sorted_value]+extra_off) : sorted_value) * xmax; int num_idx = idx_end - idx_begin; int idx0 = idx_begin + threadIdx.y*num_idx/blockDim.y; @@ -202,7 +203,7 @@ AddTakeGradLargeBatchWorkspaceSize(size_t num_keys) { return (unique_bytes + counts_bytes + num_runs_bytes + temporary_bytes); } -template +template inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst, const mshadow::Tensor& sorted, const mshadow::Tensor& index, @@ -230,7 +231,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst switch (SZ) { case 1: - AddTakeGradLargeBatchKernel<1, lookup, DType> + AddTakeGradLargeBatchKernel<1, lookup, hb_offset, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -238,7 +239,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 2: - AddTakeGradLargeBatchKernel<2, lookup, DType> + AddTakeGradLargeBatchKernel<2, lookup, hb_offset, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -246,7 +247,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 3: - AddTakeGradLargeBatchKernel<3, lookup, DType> + AddTakeGradLargeBatchKernel<3, lookup, hb_offset, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -254,7 +255,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 4: - AddTakeGradLargeBatchKernel<4, lookup, DType> + AddTakeGradLargeBatchKernel<4, lookup, hb_offset, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -320,7 +321,7 @@ inline void AddTakeGradLargeBatch(mshadow::Tensor dst, sorted.size(0), stream); } nnvm::dim_t* lookup_table = nullptr; - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, num_runs_ptr, lookup_table); } diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index f9dd8629104b..b8c4ecc95cb8 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -26,7 +26,6 @@ #include "./indexing_op.h" #include "./util/tensor_util-inl.cuh" -#include namespace mxnet { namespace op { @@ -104,6 +103,129 @@ void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, } } + +struct mark_lookup_table { + template + MSHADOW_XINLINE static void Map(int i, IType* out, const DType* data) { + out[static_cast(data[i])] = i; + } +}; + +inline void SparseEmbeddingOpBackwardRspImpl4(const OpContext& ctx, + const TBlob& ograd, + const TBlob& data, + const OpReqType req, + const NDArray& output) { + using namespace mshadow; + using namespace mxnet_op; + using namespace expr; + using namespace rowsparse; + using nnvm::dim_t; + if (req == kNullOp) 
return; + CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " + << "weight gradient calculation with req != write"; + + Stream *s = ctx.get_stream(); + dim_t num_rows = output.shape()[0]; + dim_t row_length = output.shape()[1]; + dim_t data_size = static_cast(data.shape_.Size()); + if (data_size == 0) { + FillZerosRspImpl(s, output); + return; + } + + MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { + MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { + // temp resource declarations + dim_t* lookup_table = NULL; + void* temp_storage = NULL; + dim_t* sorted_data = NULL; + dim_t* original_idx = NULL; + // calculate number of bytes for temp resources + size_t lookup_table_bytes = num_rows * sizeof(dim_t); + size_t sorted_data_storage_bytes = data_size * sizeof(dim_t); + size_t original_idx_storage_bytes = data_size * sizeof(dim_t); + size_t sort_workspace_size = SortByKeyWorkspaceSize(data_size); + size_t unique_workspace_bytes = 0; + // estimate unique temp space + IType* data_ptr = data.dptr(); + size_t *null_ptr = nullptr; + cub::DeviceSelect::Unique(NULL, unique_workspace_bytes, data_ptr, data_ptr, + null_ptr, data_size, Stream::GetStream(s)); + // One more space reserved for unique count + size_t temp_workspace_bytes = std::max(unique_workspace_bytes, + sort_workspace_size); + size_t total_storage_bytes = lookup_table_bytes + sorted_data_storage_bytes + + original_idx_storage_bytes + temp_workspace_bytes; + + // request resource and split it. layout is: + // lookup_table, sorted_data, original_idx, temp_storage + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(total_storage_bytes), s); + lookup_table = reinterpret_cast(workspace.dptr_); + sorted_data = reinterpret_cast(workspace.dptr_ + lookup_table_bytes); + original_idx = reinterpret_cast(workspace.dptr_ + lookup_table_bytes + + sorted_data_storage_bytes); + temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; + + // make a copy of the data, to be sorted + TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); + auto sorted_data_tensor = sorted_data_blob.FlatTo1D(s); + mxnet_op::copy(s, sorted_data_blob, data); + + // generate original idx + Tensor original_idx_tensor(original_idx, Shape1(data_size), s); + Kernel::Launch(s, data_size, 1, static_cast(0), static_cast(1), + kWriteTo, original_idx); + // sort data with its original idx + int num_bits = ilog2(num_rows - 1); + char* temp_storage_ptr = reinterpret_cast(temp_storage); + Tensor temp_storage_tensor(temp_storage_ptr, + Shape1(sort_workspace_size), s); + SortByKey(sorted_data_tensor, original_idx_tensor, true, + &temp_storage_tensor, 0, num_bits); + + // compute unique row ids based on sorted values. 
+ output.CheckAndAllocAuxData(kIdx, Shape1(data_size + 1)); + + // fill row_idx array of output matrix, using the row_flg values + RType* grad_row_idx = output.aux_data(kIdx).dptr(); + // compute unique row ids + cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data, grad_row_idx, + grad_row_idx + data_size, data_size, Stream::GetStream(s)); + + dim_t nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType), + cudaMemcpyDeviceToHost)); + CHECK_EQ(output.shape().ndim(), 2) << "Unexcepted ndim"; + output.CheckAndAllocData(Shape2(nnr, output.shape()[1])); + output.set_aux_shape(rowsparse::kIdx, Shape1(nnr)); + + // generate lookup table + Kernel::Launch(s, nnr, lookup_table, grad_row_idx); + + // accumulate gradients + DType* grad_data = output.data().dptr(); + Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), + kWriteTo, 0); +{ + dim_t* sum_counts_ptr = NULL; + int* num_runs_ptr = NULL; + mshadow::Tensor dst = output.data().get(s); + mshadow::Tensor sorted = sorted_data_tensor; + mshadow::Tensor index = original_idx_tensor; + const auto oshape = ograd.shape_; + mshadow::Tensor src = ograd.get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + num_runs_ptr, lookup_table); +} + }); + }); + }); +} + inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, const TBlob& ograd, const TBlob& data, @@ -128,10 +250,6 @@ inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, FillZerosRspImpl(s, output); return; } - s->Wait(); - auto t0 = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now().time_since_epoch()).count(); - MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { @@ -167,9 +285,6 @@ inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; // compute row flags and prefix sum num_threads = num_rows; - s->Wait(); - auto t1 = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now().time_since_epoch()).count(); Fill(s, TBlob(prefix_sum, Shape1(num_threads), gpu::kDevMask), kWriteTo, 0); Kernel::Launch(s, data_size, prefix_sum, data.dptr()); cub::DeviceScan::InclusiveSum(temp_storage, @@ -179,9 +294,6 @@ inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, num_rows, mshadow::Stream::GetStream(s)); // retrieve nnr and allocate output - s->Wait(); - auto t2 = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now().time_since_epoch()).count(); dim_t nnr = 0; CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t), cudaMemcpyDeviceToHost)); @@ -205,15 +317,8 @@ inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, char* temp_storage_ptr = reinterpret_cast(temp_storage); Tensor temp_storage_tensor(temp_storage_ptr, Shape1(sort_workspace_size), s); - s->Wait(); - auto t3 = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now().time_since_epoch()).count(); SortByKey(sorted_data_tensor, original_idx_tensor, true, &temp_storage_tensor, 0, num_bits); - s->Wait(); - auto t4 = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now().time_since_epoch()).count(); - // accumulate gradients DType* grad_data = output.data().dptr(); Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), @@ -229,20 +334,8 
@@ inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, mshadow::Tensor src = ograd.get_with_shape( Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); nnvm::dim_t* lookup_table = prefix_sum; - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, num_runs_ptr, lookup_table); -} - s->Wait(); - auto t5 = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now().time_since_epoch()).count(); -bool log = dmlc::GetEnv("LOG_TIME", true); -if (log) { -LOG(INFO) << t1-t0 << "\t" - << t2-t1 << "\t" - << t3-t2 << "\t" - << t4-t3 << "\t" - << t5-t4 << "\t" - << t5-t0 << "\t"; } }); }); @@ -258,11 +351,14 @@ inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, int deterministic = dmlc::GetEnv("MXNET_DETERM", 0); if (deterministic == 1) { - LOG(FATAL) << "NOT REAHCED"; + LOG(FATAL) << "NOT REACHED"; return; } else if (deterministic == 2) { SparseEmbeddingOpBackwardRspImpl3(ctx, ograd, data, req, output); return; + } else if (deterministic == 3) { + SparseEmbeddingOpBackwardRspImpl4(ctx, ograd, data, req, output); + return; } using namespace mshadow; using namespace mxnet_op; @@ -310,7 +406,6 @@ inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, dim_t nnr = 0; CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t), cudaMemcpyDeviceToHost)); - if (nnr == 0) { FillZerosRspImpl(s, output); return; From 5d1cd64840676f25f50b3ac5ced849867af3112a Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Wed, 21 Feb 2018 08:48:07 +0000 Subject: [PATCH 06/18] remove old op --- src/operator/tensor/indexing_op-inl.cuh | 20 +- src/operator/tensor/indexing_op.cu | 245 ++---------------------- 2 files changed, 28 insertions(+), 237 deletions(-) diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh index b65de06afa91..4df1fd451ec5 100644 --- a/src/operator/tensor/indexing_op-inl.cuh +++ b/src/operator/tensor/indexing_op-inl.cuh @@ -38,7 +38,7 @@ namespace mxnet { namespace op { const int kWarpSize = 32; -template +template __global__ void AddTakeGradLargeBatchKernel(DType* dst, // If idx_start == NULL, then in-kernel edge // detection is used @@ -127,9 +127,8 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst, } const int start_feature = threadIdx.x + blockIdx.x * blockDim.x * SZ; - // TODO remove -1 - const int extra_off = hb_offset ? -1 : 0; - const int dst_row = (lookup ? (lookup_table[sorted_value]+extra_off) : sorted_value) * xmax; + // Lookup inclusive prefix sum table if necessary + const int dst_row = (lookup ? 
(lookup_table[sorted_value] - 1) : sorted_value) * xmax; int num_idx = idx_end - idx_begin; int idx0 = idx_begin + threadIdx.y*num_idx/blockDim.y; @@ -183,7 +182,6 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst, } } } - } } @@ -203,7 +201,7 @@ AddTakeGradLargeBatchWorkspaceSize(size_t num_keys) { return (unique_bytes + counts_bytes + num_runs_bytes + temporary_bytes); } -template +template inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst, const mshadow::Tensor& sorted, const mshadow::Tensor& index, @@ -231,7 +229,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst switch (SZ) { case 1: - AddTakeGradLargeBatchKernel<1, lookup, hb_offset, DType> + AddTakeGradLargeBatchKernel<1, lookup, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -239,7 +237,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 2: - AddTakeGradLargeBatchKernel<2, lookup, hb_offset, DType> + AddTakeGradLargeBatchKernel<2, lookup, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -247,7 +245,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 3: - AddTakeGradLargeBatchKernel<3, lookup, hb_offset, DType> + AddTakeGradLargeBatchKernel<3, lookup, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -255,7 +253,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 4: - AddTakeGradLargeBatchKernel<4, lookup, hb_offset, DType> + AddTakeGradLargeBatchKernel<4, lookup, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -321,7 +319,7 @@ inline void AddTakeGradLargeBatch(mshadow::Tensor dst, sorted.size(0), stream); } nnvm::dim_t* lookup_table = nullptr; - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, num_runs_ptr, lookup_table); } diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index b8c4ecc95cb8..4f1a52562121 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -103,134 +103,12 @@ void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, } } - -struct mark_lookup_table { - template - MSHADOW_XINLINE static void Map(int i, IType* out, const DType* data) { - out[static_cast(data[i])] = i; - } -}; - -inline void SparseEmbeddingOpBackwardRspImpl4(const OpContext& ctx, - const TBlob& ograd, - const TBlob& data, - const OpReqType req, - const NDArray& output) { - using namespace mshadow; - using namespace mxnet_op; - using namespace expr; - using namespace rowsparse; - using nnvm::dim_t; - if (req == kNullOp) return; - CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " - << "weight gradient calculation with req != write"; - - Stream *s = ctx.get_stream(); - dim_t num_rows = output.shape()[0]; - dim_t row_length = output.shape()[1]; - dim_t data_size = static_cast(data.shape_.Size()); - if (data_size == 0) { - FillZerosRspImpl(s, output); - return; - } - - MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { - MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { - MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { - // temp resource declarations - dim_t* lookup_table = NULL; - void* temp_storage = NULL; - dim_t* 
sorted_data = NULL; - dim_t* original_idx = NULL; - // calculate number of bytes for temp resources - size_t lookup_table_bytes = num_rows * sizeof(dim_t); - size_t sorted_data_storage_bytes = data_size * sizeof(dim_t); - size_t original_idx_storage_bytes = data_size * sizeof(dim_t); - size_t sort_workspace_size = SortByKeyWorkspaceSize(data_size); - size_t unique_workspace_bytes = 0; - // estimate unique temp space - IType* data_ptr = data.dptr(); - size_t *null_ptr = nullptr; - cub::DeviceSelect::Unique(NULL, unique_workspace_bytes, data_ptr, data_ptr, - null_ptr, data_size, Stream::GetStream(s)); - // One more space reserved for unique count - size_t temp_workspace_bytes = std::max(unique_workspace_bytes, - sort_workspace_size); - size_t total_storage_bytes = lookup_table_bytes + sorted_data_storage_bytes + - original_idx_storage_bytes + temp_workspace_bytes; - - // request resource and split it. layout is: - // lookup_table, sorted_data, original_idx, temp_storage - Tensor workspace = ctx.requested[0] - .get_space_typed(Shape1(total_storage_bytes), s); - lookup_table = reinterpret_cast(workspace.dptr_); - sorted_data = reinterpret_cast(workspace.dptr_ + lookup_table_bytes); - original_idx = reinterpret_cast(workspace.dptr_ + lookup_table_bytes + - sorted_data_storage_bytes); - temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; - - // make a copy of the data, to be sorted - TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); - auto sorted_data_tensor = sorted_data_blob.FlatTo1D(s); - mxnet_op::copy(s, sorted_data_blob, data); - - // generate original idx - Tensor original_idx_tensor(original_idx, Shape1(data_size), s); - Kernel::Launch(s, data_size, 1, static_cast(0), static_cast(1), - kWriteTo, original_idx); - // sort data with its original idx - int num_bits = ilog2(num_rows - 1); - char* temp_storage_ptr = reinterpret_cast(temp_storage); - Tensor temp_storage_tensor(temp_storage_ptr, - Shape1(sort_workspace_size), s); - SortByKey(sorted_data_tensor, original_idx_tensor, true, - &temp_storage_tensor, 0, num_bits); - - // compute unique row ids based on sorted values. 
- output.CheckAndAllocAuxData(kIdx, Shape1(data_size + 1)); - - // fill row_idx array of output matrix, using the row_flg values - RType* grad_row_idx = output.aux_data(kIdx).dptr(); - // compute unique row ids - cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data, grad_row_idx, - grad_row_idx + data_size, data_size, Stream::GetStream(s)); - - dim_t nnr = 0; - CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType), - cudaMemcpyDeviceToHost)); - CHECK_EQ(output.shape().ndim(), 2) << "Unexcepted ndim"; - output.CheckAndAllocData(Shape2(nnr, output.shape()[1])); - output.set_aux_shape(rowsparse::kIdx, Shape1(nnr)); - - // generate lookup table - Kernel::Launch(s, nnr, lookup_table, grad_row_idx); - - // accumulate gradients - DType* grad_data = output.data().dptr(); - Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), - kWriteTo, 0); -{ - dim_t* sum_counts_ptr = NULL; - int* num_runs_ptr = NULL; - mshadow::Tensor dst = output.data().get(s); - mshadow::Tensor sorted = sorted_data_tensor; - mshadow::Tensor index = original_idx_tensor; - const auto oshape = ograd.shape_; - mshadow::Tensor src = ograd.get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - num_runs_ptr, lookup_table); -} - }); - }); - }); -} - -inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, - const TBlob& ograd, - const TBlob& data, - const OpReqType req, - const NDArray& output) { +template<> +inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, + const TBlob& ograd, + const TBlob& data, + const OpReqType req, + const NDArray& output) { using namespace mshadow; using namespace mxnet_op; using namespace mshadow::expr; @@ -324,105 +202,20 @@ inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), kWriteTo, 0); -{ - dim_t* sum_counts_ptr = NULL; - int* num_runs_ptr = NULL; - mshadow::Tensor dst = output.data().get(s); - mshadow::Tensor sorted = sorted_data_tensor; - mshadow::Tensor index = original_idx_tensor; - const auto oshape = ograd.shape_; - mshadow::Tensor src = ograd.get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - nnvm::dim_t* lookup_table = prefix_sum; - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - num_runs_ptr, lookup_table); -} - }); - }); - }); -} - -template<> -inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, - const TBlob& ograd, - const TBlob& data, - const OpReqType req, - const NDArray& output) { - - int deterministic = dmlc::GetEnv("MXNET_DETERM", 0); - if (deterministic == 1) { - LOG(FATAL) << "NOT REACHED"; - return; - } else if (deterministic == 2) { - SparseEmbeddingOpBackwardRspImpl3(ctx, ograd, data, req, output); - return; - } else if (deterministic == 3) { - SparseEmbeddingOpBackwardRspImpl4(ctx, ograd, data, req, output); - return; - } - using namespace mshadow; - using namespace mxnet_op; - using namespace mshadow::expr; - using namespace rowsparse; - using nnvm::dim_t; - if (req == kNullOp) return; - CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " - << "weight gradient calculation with req != write"; - - // Request temporary storage for marking non-zero rows and prefix sum - Stream *s = ctx.get_stream(); - dim_t num_rows = output.shape()[0]; - dim_t row_length = output.shape()[1]; - dim_t data_size = 
static_cast(data.shape_.Size()); - dim_t num_threads; - - MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { - MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, { - MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { - dim_t* prefix_sum = NULL; - void* d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(d_temp_storage, - temp_storage_bytes, - prefix_sum, - prefix_sum, - num_rows, - Stream::GetStream(s)); - Tensor workspace = ctx.requested[0] - .get_space_typed(Shape1(num_rows * sizeof(dim_t) + - temp_storage_bytes), s); - prefix_sum = reinterpret_cast(workspace.dptr_); - d_temp_storage = workspace.dptr_ + num_rows*sizeof(dim_t); - num_threads = num_rows; - Fill(s, TBlob(prefix_sum, Shape1(num_threads), gpu::kDevMask), kWriteTo, 0); - Kernel::Launch(s, data_size, prefix_sum, data.dptr()); - - cub::DeviceScan::InclusiveSum(d_temp_storage, - temp_storage_bytes, - prefix_sum, - prefix_sum, - num_rows, - mshadow::Stream::GetStream(s)); - dim_t nnr = 0; - CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t), - cudaMemcpyDeviceToHost)); - if (nnr == 0) { - FillZerosRspImpl(s, output); - return; + // reuse dense op backward kernel + { + dim_t* sum_counts_ptr = NULL; + int* num_runs_ptr = NULL; + mshadow::Tensor dst = output.data().get(s); + mshadow::Tensor sorted = sorted_data_tensor; + mshadow::Tensor index = original_idx_tensor; + const auto oshape = ograd.shape_; + mshadow::Tensor src = ograd.get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + nnvm::dim_t* lookup_table = prefix_sum; + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + num_runs_ptr, lookup_table); } - output.CheckAndAlloc({Shape1(nnr)}); - RType* grad_row_idx = output.aux_data(kIdx).dptr(); - // fill row_idx array of output matrix, using the row_flg values - Kernel::Launch(s, num_rows, - grad_row_idx, prefix_sum, num_rows); - // prefill with zeros - DType* grad_data = output.data().dptr(); - Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), - kWriteTo, 0); - // add the final gradients - num_threads = row_length * data_size; - Kernel::Launch(s, num_threads, grad_data, prefix_sum, - data.dptr(), ograd.dptr(), row_length); }); }); }); From 948c5a35e124d69208127af17a0e149731697b79 Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Wed, 21 Feb 2018 09:09:30 +0000 Subject: [PATCH 07/18] remove unused kernel --- src/operator/tensor/indexing_op.cu | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 4f1a52562121..87633efaff84 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -41,25 +41,6 @@ struct is_valid_check { } }; - -struct AddTakeGradRspGPUKernel { - template - __device__ __forceinline__ static void Map(int tid, - DType* out, - const nnvm::dim_t* prefix_sum, - const IType* data, - const DType* ograd, - const nnvm::dim_t row_length) { - using nnvm::dim_t; - const dim_t data_i = tid / row_length; - const dim_t grad_i = tid % row_length; - const dim_t irow = static_cast(data[data_i]); - const dim_t rsp_row = prefix_sum[irow] - 1; - const DType val = ograd[data_i * row_length + grad_i]; - atomicAdd(static_cast(&(out[rsp_row*row_length+grad_i])), val); - } -}; - template<> void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, const TBlob& data, From 1d64ce27d05de9b52a794c3c83b3c13ae5de6e93 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 24 Feb 2018 13:39:10 +0000 
Subject: [PATCH 08/18] Revert "remove unused kernel" This reverts commit 948c5a35e124d69208127af17a0e149731697b79. --- src/operator/tensor/indexing_op.cu | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 87633efaff84..4f1a52562121 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -41,6 +41,25 @@ struct is_valid_check { } }; + +struct AddTakeGradRspGPUKernel { + template + __device__ __forceinline__ static void Map(int tid, + DType* out, + const nnvm::dim_t* prefix_sum, + const IType* data, + const DType* ograd, + const nnvm::dim_t row_length) { + using nnvm::dim_t; + const dim_t data_i = tid / row_length; + const dim_t grad_i = tid % row_length; + const dim_t irow = static_cast(data[data_i]); + const dim_t rsp_row = prefix_sum[irow] - 1; + const DType val = ograd[data_i * row_length + grad_i]; + atomicAdd(static_cast(&(out[rsp_row*row_length+grad_i])), val); + } +}; + template<> void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, const TBlob& data, From f57df63fff326a6963f3e6421ee83498cc2a883c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 24 Feb 2018 13:45:37 +0000 Subject: [PATCH 09/18] Revert "remove old op" This reverts commit 5d1cd64840676f25f50b3ac5ced849867af3112a. --- src/operator/tensor/indexing_op-inl.cuh | 20 +- src/operator/tensor/indexing_op.cu | 245 ++++++++++++++++++++++-- 2 files changed, 237 insertions(+), 28 deletions(-) diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh index 4df1fd451ec5..b65de06afa91 100644 --- a/src/operator/tensor/indexing_op-inl.cuh +++ b/src/operator/tensor/indexing_op-inl.cuh @@ -38,7 +38,7 @@ namespace mxnet { namespace op { const int kWarpSize = 32; -template +template __global__ void AddTakeGradLargeBatchKernel(DType* dst, // If idx_start == NULL, then in-kernel edge // detection is used @@ -127,8 +127,9 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst, } const int start_feature = threadIdx.x + blockIdx.x * blockDim.x * SZ; - // Lookup inclusive prefix sum table if necessary - const int dst_row = (lookup ? (lookup_table[sorted_value] - 1) : sorted_value) * xmax; + // TODO remove -1 + const int extra_off = hb_offset ? -1 : 0; + const int dst_row = (lookup ? 
(lookup_table[sorted_value]+extra_off) : sorted_value) * xmax; int num_idx = idx_end - idx_begin; int idx0 = idx_begin + threadIdx.y*num_idx/blockDim.y; @@ -182,6 +183,7 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst, } } } + } } @@ -201,7 +203,7 @@ AddTakeGradLargeBatchWorkspaceSize(size_t num_keys) { return (unique_bytes + counts_bytes + num_runs_bytes + temporary_bytes); } -template +template inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst, const mshadow::Tensor& sorted, const mshadow::Tensor& index, @@ -229,7 +231,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst switch (SZ) { case 1: - AddTakeGradLargeBatchKernel<1, lookup, DType> + AddTakeGradLargeBatchKernel<1, lookup, hb_offset, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -237,7 +239,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 2: - AddTakeGradLargeBatchKernel<2, lookup, DType> + AddTakeGradLargeBatchKernel<2, lookup, hb_offset, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -245,7 +247,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 3: - AddTakeGradLargeBatchKernel<3, lookup, DType> + AddTakeGradLargeBatchKernel<3, lookup, hb_offset, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -253,7 +255,7 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst static_cast(src.size(1)), lookup_table); break; case 4: - AddTakeGradLargeBatchKernel<4, lookup, DType> + AddTakeGradLargeBatchKernel<4, lookup, hb_offset, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, @@ -319,7 +321,7 @@ inline void AddTakeGradLargeBatch(mshadow::Tensor dst, sorted.size(0), stream); } nnvm::dim_t* lookup_table = nullptr; - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, num_runs_ptr, lookup_table); } diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 4f1a52562121..b8c4ecc95cb8 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -103,12 +103,134 @@ void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, } } -template<> -inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, - const TBlob& ograd, - const TBlob& data, - const OpReqType req, - const NDArray& output) { + +struct mark_lookup_table { + template + MSHADOW_XINLINE static void Map(int i, IType* out, const DType* data) { + out[static_cast(data[i])] = i; + } +}; + +inline void SparseEmbeddingOpBackwardRspImpl4(const OpContext& ctx, + const TBlob& ograd, + const TBlob& data, + const OpReqType req, + const NDArray& output) { + using namespace mshadow; + using namespace mxnet_op; + using namespace expr; + using namespace rowsparse; + using nnvm::dim_t; + if (req == kNullOp) return; + CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " + << "weight gradient calculation with req != write"; + + Stream *s = ctx.get_stream(); + dim_t num_rows = output.shape()[0]; + dim_t row_length = output.shape()[1]; + dim_t data_size = static_cast(data.shape_.Size()); + if (data_size == 0) { + FillZerosRspImpl(s, output); + return; + } + + MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { + 
MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { + // temp resource declarations + dim_t* lookup_table = NULL; + void* temp_storage = NULL; + dim_t* sorted_data = NULL; + dim_t* original_idx = NULL; + // calculate number of bytes for temp resources + size_t lookup_table_bytes = num_rows * sizeof(dim_t); + size_t sorted_data_storage_bytes = data_size * sizeof(dim_t); + size_t original_idx_storage_bytes = data_size * sizeof(dim_t); + size_t sort_workspace_size = SortByKeyWorkspaceSize(data_size); + size_t unique_workspace_bytes = 0; + // estimate unique temp space + IType* data_ptr = data.dptr(); + size_t *null_ptr = nullptr; + cub::DeviceSelect::Unique(NULL, unique_workspace_bytes, data_ptr, data_ptr, + null_ptr, data_size, Stream::GetStream(s)); + // One more space reserved for unique count + size_t temp_workspace_bytes = std::max(unique_workspace_bytes, + sort_workspace_size); + size_t total_storage_bytes = lookup_table_bytes + sorted_data_storage_bytes + + original_idx_storage_bytes + temp_workspace_bytes; + + // request resource and split it. layout is: + // lookup_table, sorted_data, original_idx, temp_storage + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(total_storage_bytes), s); + lookup_table = reinterpret_cast(workspace.dptr_); + sorted_data = reinterpret_cast(workspace.dptr_ + lookup_table_bytes); + original_idx = reinterpret_cast(workspace.dptr_ + lookup_table_bytes + + sorted_data_storage_bytes); + temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; + + // make a copy of the data, to be sorted + TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); + auto sorted_data_tensor = sorted_data_blob.FlatTo1D(s); + mxnet_op::copy(s, sorted_data_blob, data); + + // generate original idx + Tensor original_idx_tensor(original_idx, Shape1(data_size), s); + Kernel::Launch(s, data_size, 1, static_cast(0), static_cast(1), + kWriteTo, original_idx); + // sort data with its original idx + int num_bits = ilog2(num_rows - 1); + char* temp_storage_ptr = reinterpret_cast(temp_storage); + Tensor temp_storage_tensor(temp_storage_ptr, + Shape1(sort_workspace_size), s); + SortByKey(sorted_data_tensor, original_idx_tensor, true, + &temp_storage_tensor, 0, num_bits); + + // compute unique row ids based on sorted values. 
+ output.CheckAndAllocAuxData(kIdx, Shape1(data_size + 1)); + + // fill row_idx array of output matrix, using the row_flg values + RType* grad_row_idx = output.aux_data(kIdx).dptr(); + // compute unique row ids + cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data, grad_row_idx, + grad_row_idx + data_size, data_size, Stream::GetStream(s)); + + dim_t nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType), + cudaMemcpyDeviceToHost)); + CHECK_EQ(output.shape().ndim(), 2) << "Unexcepted ndim"; + output.CheckAndAllocData(Shape2(nnr, output.shape()[1])); + output.set_aux_shape(rowsparse::kIdx, Shape1(nnr)); + + // generate lookup table + Kernel::Launch(s, nnr, lookup_table, grad_row_idx); + + // accumulate gradients + DType* grad_data = output.data().dptr(); + Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), + kWriteTo, 0); +{ + dim_t* sum_counts_ptr = NULL; + int* num_runs_ptr = NULL; + mshadow::Tensor dst = output.data().get(s); + mshadow::Tensor sorted = sorted_data_tensor; + mshadow::Tensor index = original_idx_tensor; + const auto oshape = ograd.shape_; + mshadow::Tensor src = ograd.get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + num_runs_ptr, lookup_table); +} + }); + }); + }); +} + +inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, + const TBlob& ograd, + const TBlob& data, + const OpReqType req, + const NDArray& output) { using namespace mshadow; using namespace mxnet_op; using namespace mshadow::expr; @@ -202,20 +324,105 @@ inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), kWriteTo, 0); - // reuse dense op backward kernel - { - dim_t* sum_counts_ptr = NULL; - int* num_runs_ptr = NULL; - mshadow::Tensor dst = output.data().get(s); - mshadow::Tensor sorted = sorted_data_tensor; - mshadow::Tensor index = original_idx_tensor; - const auto oshape = ograd.shape_; - mshadow::Tensor src = ograd.get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - nnvm::dim_t* lookup_table = prefix_sum; - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - num_runs_ptr, lookup_table); +{ + dim_t* sum_counts_ptr = NULL; + int* num_runs_ptr = NULL; + mshadow::Tensor dst = output.data().get(s); + mshadow::Tensor sorted = sorted_data_tensor; + mshadow::Tensor index = original_idx_tensor; + const auto oshape = ograd.shape_; + mshadow::Tensor src = ograd.get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + nnvm::dim_t* lookup_table = prefix_sum; + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + num_runs_ptr, lookup_table); +} + }); + }); + }); +} + +template<> +inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, + const TBlob& ograd, + const TBlob& data, + const OpReqType req, + const NDArray& output) { + + int deterministic = dmlc::GetEnv("MXNET_DETERM", 0); + if (deterministic == 1) { + LOG(FATAL) << "NOT REACHED"; + return; + } else if (deterministic == 2) { + SparseEmbeddingOpBackwardRspImpl3(ctx, ograd, data, req, output); + return; + } else if (deterministic == 3) { + SparseEmbeddingOpBackwardRspImpl4(ctx, ograd, data, req, output); + return; + } + using namespace mshadow; + using namespace mxnet_op; + using namespace mshadow::expr; + using namespace rowsparse; + using 
nnvm::dim_t; + if (req == kNullOp) return; + CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " + << "weight gradient calculation with req != write"; + + // Request temporary storage for marking non-zero rows and prefix sum + Stream *s = ctx.get_stream(); + dim_t num_rows = output.shape()[0]; + dim_t row_length = output.shape()[1]; + dim_t data_size = static_cast(data.shape_.Size()); + dim_t num_threads; + + MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { + MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { + dim_t* prefix_sum = NULL; + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + prefix_sum, + prefix_sum, + num_rows, + Stream::GetStream(s)); + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(num_rows * sizeof(dim_t) + + temp_storage_bytes), s); + prefix_sum = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_rows*sizeof(dim_t); + num_threads = num_rows; + Fill(s, TBlob(prefix_sum, Shape1(num_threads), gpu::kDevMask), kWriteTo, 0); + Kernel::Launch(s, data_size, prefix_sum, data.dptr()); + + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + prefix_sum, + prefix_sum, + num_rows, + mshadow::Stream::GetStream(s)); + dim_t nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t), + cudaMemcpyDeviceToHost)); + if (nnr == 0) { + FillZerosRspImpl(s, output); + return; } + output.CheckAndAlloc({Shape1(nnr)}); + RType* grad_row_idx = output.aux_data(kIdx).dptr(); + // fill row_idx array of output matrix, using the row_flg values + Kernel::Launch(s, num_rows, + grad_row_idx, prefix_sum, num_rows); + // prefill with zeros + DType* grad_data = output.data().dptr(); + Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), + kWriteTo, 0); + // add the final gradients + num_threads = row_length * data_size; + Kernel::Launch(s, num_threads, grad_data, prefix_sum, + data.dptr(), ograd.dptr(), row_length); }); }); }); From 9dd756d467fcdd65b2536c49eb9e5a9b3c9dabe2 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 25 Feb 2018 03:05:20 +0000 Subject: [PATCH 10/18] fix kernellaucnher --- src/operator/tensor/indexing_op-inl.cuh | 7 +- src/operator/tensor/indexing_op.cu | 169 ++++++++++++++++++++++-- 2 files changed, 165 insertions(+), 11 deletions(-) diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh index b65de06afa91..50f2154db2a2 100644 --- a/src/operator/tensor/indexing_op-inl.cuh +++ b/src/operator/tensor/indexing_op-inl.cuh @@ -210,9 +210,10 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst const mshadow::Tensor &src, IndexType* sum_counts_ptr, int* num_runs_ptr, - const nnvm::dim_t* lookup_table) { + const nnvm::dim_t* lookup_table, + const mshadow::index_t num_rows) { cudaStream_t stream = mshadow::Stream::GetStream(dst.stream_); - const int num_unique_est = min(dst.size(0), src.size(0)); + const int num_unique_est = min(num_rows, src.size(0)); const int max_nthread = 128; const int num_y = max(src.size(0)/num_unique_est, 1); const int block_dim_x = kWarpSize; @@ -322,7 +323,7 @@ inline void AddTakeGradLargeBatch(mshadow::Tensor dst, } nnvm::dim_t* lookup_table = nullptr; AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - num_runs_ptr, lookup_table); + num_runs_ptr, lookup_table, dst.size(0)); } } // namespace op diff --git a/src/operator/tensor/indexing_op.cu 
b/src/operator/tensor/indexing_op.cu index b8c4ecc95cb8..c672eae329de 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -60,6 +60,40 @@ struct AddTakeGradRspGPUKernel { } }; + +struct AddTakeGradRspGPUKernel2 { + template + __device__ __forceinline__ static void Map(int thread_id, + DType* out, + const nnvm::dim_t* prefix_sum, + const nnvm::dim_t* sorted_data, + const nnvm::dim_t data_size, + const nnvm::dim_t* original_idx, + const DType* ograd, + const nnvm::dim_t row_length, + const nnvm::dim_t num_threads_per_row, + const int prefix) { + using nnvm::dim_t; + auto tid = thread_id / num_threads_per_row; + auto feature_start = thread_id % num_threads_per_row * 4; + auto feature_end = feature_start + 4; + if (feature_end > row_length) feature_end = row_length; + if (tid == 0 || sorted_data[tid - 1] != sorted_data[tid]) { + do { + dim_t data = sorted_data[tid]; + dim_t idx = original_idx[tid]; + dim_t row_id = prefix_sum[data] - prefix; + dim_t ograd_offset = idx * row_length; + dim_t out_offset = row_id * row_length; + for (int i = feature_start; i < feature_end; i++) { + out[out_offset + i] += ograd[ograd_offset + i]; + } + tid++; + } while (tid < data_size && sorted_data[tid - 1] == sorted_data[tid]); + } + } +}; + template<> void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, const TBlob& data, @@ -111,6 +145,124 @@ struct mark_lookup_table { } }; +inline void SparseEmbeddingOpBackwardRspImpl5(const OpContext& ctx, + const TBlob& ograd, + const TBlob& data, + const OpReqType req, + const NDArray& output) { + using namespace mshadow; + using namespace mxnet_op; + using namespace expr; + using namespace rowsparse; + using nnvm::dim_t; + if (req == kNullOp) return; + CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " + << "weight gradient calculation with req != write"; + + Stream *s = ctx.get_stream(); + dim_t num_rows = output.shape()[0]; + dim_t row_length = output.shape()[1]; + dim_t data_size = static_cast(data.shape_.Size()); + if (data_size == 0) { + FillZerosRspImpl(s, output); + return; + } + + MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { + MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { + // temp resource declarations + dim_t* lookup_table = NULL; + void* temp_storage = NULL; + dim_t* sorted_data = NULL; + dim_t* original_idx = NULL; + // calculate number of bytes for temp resources + size_t lookup_table_bytes = num_rows * sizeof(dim_t); + size_t sorted_data_storage_bytes = data_size * sizeof(dim_t); + size_t original_idx_storage_bytes = data_size * sizeof(dim_t); + size_t sort_workspace_size = SortByKeyWorkspaceSize(data_size); + size_t unique_workspace_bytes = 0; + // estimate unique temp space + IType* data_ptr = data.dptr(); + size_t *null_ptr = nullptr; + cub::DeviceSelect::Unique(NULL, unique_workspace_bytes, data_ptr, data_ptr, + null_ptr, data_size, Stream::GetStream(s)); + // One more space reserved for unique count + size_t temp_workspace_bytes = std::max(unique_workspace_bytes, + sort_workspace_size); + size_t total_storage_bytes = lookup_table_bytes + sorted_data_storage_bytes + + original_idx_storage_bytes + temp_workspace_bytes; + + // request resource and split it. 
layout is: + // lookup_table, sorted_data, original_idx, temp_storage + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(total_storage_bytes), s); + lookup_table = reinterpret_cast(workspace.dptr_); + sorted_data = reinterpret_cast(workspace.dptr_ + lookup_table_bytes); + original_idx = reinterpret_cast(workspace.dptr_ + lookup_table_bytes + + sorted_data_storage_bytes); + temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; + + // make a copy of the data, to be sorted + TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); + auto sorted_data_tensor = sorted_data_blob.FlatTo1D(s); + mxnet_op::copy(s, sorted_data_blob, data); + + // generate original idx + Tensor original_idx_tensor(original_idx, Shape1(data_size), s); + Kernel::Launch(s, data_size, 1, static_cast(0), static_cast(1), + kWriteTo, original_idx); + // sort data with its original idx + int num_bits = ilog2(num_rows - 1); + char* temp_storage_ptr = reinterpret_cast(temp_storage); + Tensor temp_storage_tensor(temp_storage_ptr, + Shape1(sort_workspace_size), s); + SortByKey(sorted_data_tensor, original_idx_tensor, true, + &temp_storage_tensor, 0, num_bits); + + // compute unique row ids based on sorted values. + output.CheckAndAllocAuxData(kIdx, Shape1(data_size + 1)); + + // fill row_idx array of output matrix, using the row_flg values + RType* grad_row_idx = output.aux_data(kIdx).dptr(); + cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data, grad_row_idx, + grad_row_idx + data_size, data_size, Stream::GetStream(s)); + + dim_t nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType), + cudaMemcpyDeviceToHost)); + CHECK_EQ(output.shape().ndim(), 2) << "Unexcepted ndim"; + output.CheckAndAllocData(Shape2(nnr, output.shape()[1])); + output.set_aux_shape(rowsparse::kIdx, Shape1(nnr)); + + // generate lookup table + Kernel::Launch(s, nnr, lookup_table, grad_row_idx); + + // accumulate gradients + DType* grad_data = output.data().dptr(); + Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), + kWriteTo, 0); + const nnvm::dim_t num_threads_per_row = (row_length + 3) / 4; + Kernel::Launch(s, data_size * num_threads_per_row, grad_data, lookup_table, + sorted_data, data_size, original_idx, ograd.dptr(), row_length, num_threads_per_row, 0); + +//{ + //dim_t* sum_counts_ptr = NULL; + //int* num_runs_ptr = NULL; + //mshadow::Tensor dst = output.data().get(s); + //mshadow::Tensor sorted = sorted_data_tensor; + //mshadow::Tensor index = original_idx_tensor; + //const auto oshape = ograd.shape_; + //mshadow::Tensor src = ograd.get_with_shape( + // Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + //AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + // num_runs_ptr, lookup_table); +//} + }); + }); + }); +} + inline void SparseEmbeddingOpBackwardRspImpl4(const OpContext& ctx, const TBlob& ograd, const TBlob& data, @@ -191,7 +343,6 @@ inline void SparseEmbeddingOpBackwardRspImpl4(const OpContext& ctx, // fill row_idx array of output matrix, using the row_flg values RType* grad_row_idx = output.aux_data(kIdx).dptr(); - // compute unique row ids cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data, grad_row_idx, grad_row_idx + data_size, data_size, Stream::GetStream(s)); @@ -219,7 +370,7 @@ inline void SparseEmbeddingOpBackwardRspImpl4(const OpContext& ctx, mshadow::Tensor src = ograd.get_with_shape( Shape2(oshape.ProdShape(0, oshape.ndim()-1), 
oshape[oshape.ndim()-1]), s); AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - num_runs_ptr, lookup_table); + num_runs_ptr, lookup_table, num_rows); } }); }); @@ -335,7 +486,7 @@ inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); nnvm::dim_t* lookup_table = prefix_sum; AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - num_runs_ptr, lookup_table); + num_runs_ptr, lookup_table, num_rows); } }); }); @@ -350,15 +501,17 @@ inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, const NDArray& output) { int deterministic = dmlc::GetEnv("MXNET_DETERM", 0); - if (deterministic == 1) { - LOG(FATAL) << "NOT REACHED"; - return; - } else if (deterministic == 2) { + if (deterministic == 3) { SparseEmbeddingOpBackwardRspImpl3(ctx, ograd, data, req, output); return; - } else if (deterministic == 3) { + } else if (deterministic == 4) { SparseEmbeddingOpBackwardRspImpl4(ctx, ograd, data, req, output); return; + } else if (deterministic == 5) { + SparseEmbeddingOpBackwardRspImpl5(ctx, ograd, data, req, output); + return; + } else if (deterministic != 0) { + LOG(FATAL) << "NOT REACHED"; } using namespace mshadow; using namespace mxnet_op; From f40bf9cd4a5421909579b3739c43e4b8211ec782 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 25 Feb 2018 09:28:12 +0000 Subject: [PATCH 11/18] add force_determ option --- src/operator/tensor/indexing_op-inl.cuh | 34 ++- src/operator/tensor/indexing_op.cc | 20 +- src/operator/tensor/indexing_op.cu | 297 ++---------------------- src/operator/tensor/indexing_op.h | 35 ++- 4 files changed, 82 insertions(+), 304 deletions(-) diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh index 50f2154db2a2..34cc26302548 100644 --- a/src/operator/tensor/indexing_op-inl.cuh +++ b/src/operator/tensor/indexing_op-inl.cuh @@ -38,7 +38,7 @@ namespace mxnet { namespace op { const int kWarpSize = 32; -template +template __global__ void AddTakeGradLargeBatchKernel(DType* dst, // If idx_start == NULL, then in-kernel edge // detection is used @@ -47,9 +47,7 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst, const int* idx_start_size_ptr, const IdxType *sorted, const IdxType *index, const DType *src, - int ymax, int xmax, - // table to look up positions of row_ids in dst - const nnvm::dim_t *lookup_table) { + int ymax, int xmax) { // Size of the shared memory is [blockDim.x*SZ*blockDim.y]*sizeof(DType) extern __shared__ char sh_grad_weight_char[]; DType* sh_grad_weight = (DType*)sh_grad_weight_char; @@ -127,9 +125,7 @@ __global__ void AddTakeGradLargeBatchKernel(DType* dst, } const int start_feature = threadIdx.x + blockIdx.x * blockDim.x * SZ; - // TODO remove -1 - const int extra_off = hb_offset ? -1 : 0; - const int dst_row = (lookup ? 
(lookup_table[sorted_value]+extra_off) : sorted_value) * xmax; + const int dst_row = sorted_value * xmax; int num_idx = idx_end - idx_begin; int idx0 = idx_begin + threadIdx.y*num_idx/blockDim.y; @@ -203,14 +199,13 @@ AddTakeGradLargeBatchWorkspaceSize(size_t num_keys) { return (unique_bytes + counts_bytes + num_runs_bytes + temporary_bytes); } -template +template inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst, const mshadow::Tensor& sorted, const mshadow::Tensor& index, const mshadow::Tensor &src, IndexType* sum_counts_ptr, int* num_runs_ptr, - const nnvm::dim_t* lookup_table, const mshadow::index_t num_rows) { cudaStream_t stream = mshadow::Stream::GetStream(dst.stream_); const int num_unique_est = min(num_rows, src.size(0)); @@ -232,36 +227,36 @@ inline void AddTakeGradLargeBatchKernelLaunch(mshadow::Tensor dst switch (SZ) { case 1: - AddTakeGradLargeBatchKernel<1, lookup, hb_offset, DType> + AddTakeGradLargeBatchKernel<1, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, static_cast(src.size(0)), - static_cast(src.size(1)), lookup_table); + static_cast(src.size(1))); break; case 2: - AddTakeGradLargeBatchKernel<2, lookup, hb_offset, DType> + AddTakeGradLargeBatchKernel<2, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, static_cast(src.size(0)), - static_cast(src.size(1)), lookup_table); + static_cast(src.size(1))); break; case 3: - AddTakeGradLargeBatchKernel<3, lookup, hb_offset, DType> + AddTakeGradLargeBatchKernel<3, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, static_cast(src.size(0)), - static_cast(src.size(1)), lookup_table); + static_cast(src.size(1))); break; case 4: - AddTakeGradLargeBatchKernel<4, lookup, hb_offset, DType> + AddTakeGradLargeBatchKernel<4, DType> <<>> (dst.dptr_, sum_counts_ptr, num_runs_ptr, sorted.dptr_, index.dptr_, src.dptr_, static_cast(src.size(0)), - static_cast(src.size(1)), lookup_table); + static_cast(src.size(1))); break; default: LOG(FATAL) << "AddTakeGradLargeBatch, incorrect value SZ " << SZ; @@ -321,9 +316,8 @@ inline void AddTakeGradLargeBatch(mshadow::Tensor dst, (temporary_storage, temporary_bytes, counts_out_ptr, sum_counts_ptr, sorted.size(0), stream); } - nnvm::dim_t* lookup_table = nullptr; - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - num_runs_ptr, lookup_table, dst.size(0)); + AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, + num_runs_ptr, dst.size(0)); } } // namespace op diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index cce4537ae3a2..2bc0936e059d 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -70,7 +70,8 @@ void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, template<> -inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, +inline void SparseEmbeddingOpBackwardRspImpl(const SparseEmbeddingParam& param, + const OpContext& ctx, const TBlob& ograd, const TBlob& data, const OpReqType req, @@ -178,6 +179,7 @@ GatherNDBackwardImpl(int N, int M, int K, } DMLC_REGISTER_PARAMETER(EmbeddingParam); +DMLC_REGISTER_PARAMETER(SparseEmbeddingParam); DMLC_REGISTER_PARAMETER(TakeParam); DMLC_REGISTER_PARAMETER(OneHotParam); DMLC_REGISTER_PARAMETER(ScatterNDParam); @@ -230,8 +232,8 @@ Examples:: [](const NodeAttrs& attrs) { return std::vector{"data", "weight"}; }) -.set_attr("FInferShape", EmbeddingOpShape) -.set_attr("FInferType", 
EmbeddingOpType) +.set_attr("FInferShape", EmbeddingOpShape) +.set_attr("FInferType", EmbeddingOpType) .set_attr("FResourceRequest", [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; @@ -268,6 +270,11 @@ The storage type of weight must be `row_sparse`, and the gradient of the weight `SparseEmbedding` is designed for the use case where `input_dim` is very large (e.g. 100k). The operator is available on both CPU and GPU. + When `force_deterministic` is set to `True`, the accumulation of gradients follows a + deterministic order if a feature appears multiple times in the input. However, the + backward computation is usually slower when it is set to `True`. + When the operator is used in recurrent neural network models on the GPU, + the recommended value for `force_deterministic` is `True`. Examples:: @@ -294,7 +301,7 @@ Examples:: )code" ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(1) -.set_attr_parser(ParamParser) +.set_attr_parser(ParamParser) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { return std::vector{"data", "weight"}; @@ -303,8 +310,8 @@ Examples:: [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; }) -.set_attr("FInferShape", EmbeddingOpShape) -.set_attr("FInferType", EmbeddingOpType) +.set_attr("FInferShape", EmbeddingOpShape) +.set_attr("FInferType", EmbeddingOpType) .set_attr("FInferStorageType", SparseEmbeddingOpForwardStorageType) .set_attr("FComputeEx", SparseEmbeddingOpForwardEx) .set_attr("FGradient", @@ -327,6 +334,7 @@ NNVM_REGISTER_OP(_backward_Embedding) .set_attr("FCompute", EmbeddingOpBackward); NNVM_REGISTER_OP(_backward_SparseEmbedding) +.set_attr_parser(ParamParser) .set_num_inputs(2) .set_num_outputs(2) .set_attr("FResourceRequest", diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index c672eae329de..32011674d48f 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -61,7 +61,7 @@ struct AddTakeGradRspGPUKernel { }; -struct AddTakeGradRspGPUKernel2 { +struct AddTakeGradRspDeterministicKernel { template __device__ __forceinline__ static void Map(int thread_id, DType* out, @@ -72,19 +72,19 @@ struct AddTakeGradRspGPUKernel2 { const DType* ograd, const nnvm::dim_t row_length, const nnvm::dim_t num_threads_per_row, - const int prefix) { + const int SZ) { using nnvm::dim_t; - auto tid = thread_id / num_threads_per_row; - auto feature_start = thread_id % num_threads_per_row * 4; - auto feature_end = feature_start + 4; + int tid = thread_id / num_threads_per_row; + const int feature_start = thread_id % num_threads_per_row * SZ; + int feature_end = feature_start + SZ; if (feature_end > row_length) feature_end = row_length; if (tid == 0 || sorted_data[tid - 1] != sorted_data[tid]) { do { - dim_t data = sorted_data[tid]; - dim_t idx = original_idx[tid]; - dim_t row_id = prefix_sum[data] - prefix; - dim_t ograd_offset = idx * row_length; - dim_t out_offset = row_id * row_length; + const dim_t data = sorted_data[tid]; + const dim_t idx = original_idx[tid]; + const dim_t row_id = prefix_sum[data]; + const dim_t ograd_offset = idx * row_length; + const dim_t out_offset = row_id * row_length; for (int i = feature_start; i < feature_end; i++) { out[out_offset + i] += ograd[ograd_offset + i]; } @@ -145,11 +145,11 @@ struct mark_lookup_table { } }; -inline void SparseEmbeddingOpBackwardRspImpl5(const OpContext& ctx, - const TBlob& ograd, - const TBlob& data, - const OpReqType req, - const NDArray& output) { +inline void 
SparseEmbeddingOpBackwardDeterministicRspImpl(const OpContext& ctx, + const TBlob& ograd, + const TBlob& data, + const OpReqType req, + const NDArray& output) { using namespace mshadow; using namespace mxnet_op; using namespace expr; @@ -242,276 +242,27 @@ inline void SparseEmbeddingOpBackwardRspImpl5(const OpContext& ctx, DType* grad_data = output.data().dptr(); Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), kWriteTo, 0); - const nnvm::dim_t num_threads_per_row = (row_length + 3) / 4; - Kernel::Launch(s, data_size * num_threads_per_row, grad_data, lookup_table, - sorted_data, data_size, original_idx, ograd.dptr(), row_length, num_threads_per_row, 0); - -//{ - //dim_t* sum_counts_ptr = NULL; - //int* num_runs_ptr = NULL; - //mshadow::Tensor dst = output.data().get(s); - //mshadow::Tensor sorted = sorted_data_tensor; - //mshadow::Tensor index = original_idx_tensor; - //const auto oshape = ograd.shape_; - //mshadow::Tensor src = ograd.get_with_shape( - // Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - //AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - // num_runs_ptr, lookup_table); -//} + const int SZ = 4; + const nnvm::dim_t num_threads_per_row = (row_length + SZ - 1) / SZ; + Kernel::Launch(s, data_size * num_threads_per_row, + grad_data, lookup_table, sorted_data, data_size, original_idx, + ograd.dptr(), row_length, num_threads_per_row, SZ); }); }); }); } -inline void SparseEmbeddingOpBackwardRspImpl4(const OpContext& ctx, - const TBlob& ograd, - const TBlob& data, - const OpReqType req, - const NDArray& output) { - using namespace mshadow; - using namespace mxnet_op; - using namespace expr; - using namespace rowsparse; - using nnvm::dim_t; - if (req == kNullOp) return; - CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " - << "weight gradient calculation with req != write"; - - Stream *s = ctx.get_stream(); - dim_t num_rows = output.shape()[0]; - dim_t row_length = output.shape()[1]; - dim_t data_size = static_cast(data.shape_.Size()); - if (data_size == 0) { - FillZerosRspImpl(s, output); - return; - } - - MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { - MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { - MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { - // temp resource declarations - dim_t* lookup_table = NULL; - void* temp_storage = NULL; - dim_t* sorted_data = NULL; - dim_t* original_idx = NULL; - // calculate number of bytes for temp resources - size_t lookup_table_bytes = num_rows * sizeof(dim_t); - size_t sorted_data_storage_bytes = data_size * sizeof(dim_t); - size_t original_idx_storage_bytes = data_size * sizeof(dim_t); - size_t sort_workspace_size = SortByKeyWorkspaceSize(data_size); - size_t unique_workspace_bytes = 0; - // estimate unique temp space - IType* data_ptr = data.dptr(); - size_t *null_ptr = nullptr; - cub::DeviceSelect::Unique(NULL, unique_workspace_bytes, data_ptr, data_ptr, - null_ptr, data_size, Stream::GetStream(s)); - // One more space reserved for unique count - size_t temp_workspace_bytes = std::max(unique_workspace_bytes, - sort_workspace_size); - size_t total_storage_bytes = lookup_table_bytes + sorted_data_storage_bytes + - original_idx_storage_bytes + temp_workspace_bytes; - - // request resource and split it. 
layout is: - // lookup_table, sorted_data, original_idx, temp_storage - Tensor workspace = ctx.requested[0] - .get_space_typed(Shape1(total_storage_bytes), s); - lookup_table = reinterpret_cast(workspace.dptr_); - sorted_data = reinterpret_cast(workspace.dptr_ + lookup_table_bytes); - original_idx = reinterpret_cast(workspace.dptr_ + lookup_table_bytes + - sorted_data_storage_bytes); - temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; - - // make a copy of the data, to be sorted - TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); - auto sorted_data_tensor = sorted_data_blob.FlatTo1D(s); - mxnet_op::copy(s, sorted_data_blob, data); - - // generate original idx - Tensor original_idx_tensor(original_idx, Shape1(data_size), s); - Kernel::Launch(s, data_size, 1, static_cast(0), static_cast(1), - kWriteTo, original_idx); - // sort data with its original idx - int num_bits = ilog2(num_rows - 1); - char* temp_storage_ptr = reinterpret_cast(temp_storage); - Tensor temp_storage_tensor(temp_storage_ptr, - Shape1(sort_workspace_size), s); - SortByKey(sorted_data_tensor, original_idx_tensor, true, - &temp_storage_tensor, 0, num_bits); - - // compute unique row ids based on sorted values. - output.CheckAndAllocAuxData(kIdx, Shape1(data_size + 1)); - - // fill row_idx array of output matrix, using the row_flg values - RType* grad_row_idx = output.aux_data(kIdx).dptr(); - cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data, grad_row_idx, - grad_row_idx + data_size, data_size, Stream::GetStream(s)); - - dim_t nnr = 0; - CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType), - cudaMemcpyDeviceToHost)); - CHECK_EQ(output.shape().ndim(), 2) << "Unexcepted ndim"; - output.CheckAndAllocData(Shape2(nnr, output.shape()[1])); - output.set_aux_shape(rowsparse::kIdx, Shape1(nnr)); - - // generate lookup table - Kernel::Launch(s, nnr, lookup_table, grad_row_idx); - - // accumulate gradients - DType* grad_data = output.data().dptr(); - Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), - kWriteTo, 0); -{ - dim_t* sum_counts_ptr = NULL; - int* num_runs_ptr = NULL; - mshadow::Tensor dst = output.data().get(s); - mshadow::Tensor sorted = sorted_data_tensor; - mshadow::Tensor index = original_idx_tensor; - const auto oshape = ograd.shape_; - mshadow::Tensor src = ograd.get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - num_runs_ptr, lookup_table, num_rows); -} - }); - }); - }); -} - -inline void SparseEmbeddingOpBackwardRspImpl3(const OpContext& ctx, - const TBlob& ograd, - const TBlob& data, - const OpReqType req, - const NDArray& output) { - using namespace mshadow; - using namespace mxnet_op; - using namespace mshadow::expr; - using namespace rowsparse; - using nnvm::dim_t; - if (req == kNullOp) return; - CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " - << "weight gradient calculation with req != write"; - - // Request temporary storage for marking non-zero rows and prefix sum - Stream *s = ctx.get_stream(); - dim_t num_rows = output.shape()[0]; - dim_t row_length = output.shape()[1]; - dim_t data_size = static_cast(data.shape_.Size()); - dim_t num_threads; - if (data_size == 0) { - FillZerosRspImpl(s, output); - return; - } - MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { - MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { - MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { 
- dim_t* prefix_sum = NULL; - void* temp_storage = NULL; - dim_t* sorted_data = NULL; - dim_t* original_idx = NULL; - // calculate resource bytes - size_t row_flg_storage_bytes = num_rows * sizeof(dim_t); - size_t sorted_data_storage_bytes = data_size * sizeof(dim_t); - size_t original_idx_storage_bytes = data_size * sizeof(dim_t); - size_t sum_workspace_bytes = 0; - size_t sort_workspace_size = SortByKeyWorkspaceSize(data_size); - cub::DeviceScan::InclusiveSum(temp_storage, - sum_workspace_bytes, - prefix_sum, - prefix_sum, - num_rows, - Stream::GetStream(s)); - // temp_workspace is shared by inclusive sum and sort - size_t temp_workspace_bytes = std::max(sum_workspace_bytes, sort_workspace_size); - size_t total_storage_bytes = row_flg_storage_bytes + sorted_data_storage_bytes + - original_idx_storage_bytes + temp_workspace_bytes; - - // request resource and split it. layout = - // row_flg/prefixsum, sorted_data, original_idx, temp_storage - Tensor workspace = ctx.requested[0] - .get_space_typed(Shape1(total_storage_bytes), s); - prefix_sum = reinterpret_cast(workspace.dptr_); - sorted_data = reinterpret_cast(workspace.dptr_ + row_flg_storage_bytes); - original_idx = reinterpret_cast(workspace.dptr_ + row_flg_storage_bytes + - sorted_data_storage_bytes); - temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; - // compute row flags and prefix sum - num_threads = num_rows; - Fill(s, TBlob(prefix_sum, Shape1(num_threads), gpu::kDevMask), kWriteTo, 0); - Kernel::Launch(s, data_size, prefix_sum, data.dptr()); - cub::DeviceScan::InclusiveSum(temp_storage, - temp_workspace_bytes, - prefix_sum, - prefix_sum, - num_rows, - mshadow::Stream::GetStream(s)); - // retrieve nnr and allocate output - dim_t nnr = 0; - CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t), - cudaMemcpyDeviceToHost)); - output.CheckAndAlloc({Shape1(nnr)}); - // fill row_idx array of output matrix, using the row_flg values - RType* grad_row_idx = output.aux_data(kIdx).dptr(); - Kernel::Launch(s, num_rows, - grad_row_idx, prefix_sum, num_rows); - - // make a copy of the data, to be sorted - TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); - auto sorted_data_tensor = sorted_data_blob.FlatTo1D(s); - mxnet_op::copy(s, sorted_data_blob, data); - - // generate original idx - Tensor original_idx_tensor(original_idx, Shape1(data_size), s); - Kernel::Launch(s, data_size, 1, static_cast(0), static_cast(1), - kWriteTo, original_idx); - // sort data with its original idx - int num_bits = ilog2(num_rows - 1); - char* temp_storage_ptr = reinterpret_cast(temp_storage); - Tensor temp_storage_tensor(temp_storage_ptr, - Shape1(sort_workspace_size), s); - SortByKey(sorted_data_tensor, original_idx_tensor, true, - &temp_storage_tensor, 0, num_bits); - // accumulate gradients - DType* grad_data = output.data().dptr(); - Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), - kWriteTo, 0); - -{ - dim_t* sum_counts_ptr = NULL; - int* num_runs_ptr = NULL; - mshadow::Tensor dst = output.data().get(s); - mshadow::Tensor sorted = sorted_data_tensor; - mshadow::Tensor index = original_idx_tensor; - const auto oshape = ograd.shape_; - mshadow::Tensor src = ograd.get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - nnvm::dim_t* lookup_table = prefix_sum; - AddTakeGradLargeBatchKernelLaunch(dst, sorted, index, src, sum_counts_ptr, - num_runs_ptr, lookup_table, num_rows); -} - }); - }); - }); -} template<> -inline void 
SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, +inline void SparseEmbeddingOpBackwardRspImpl(const SparseEmbeddingParam& param, + const OpContext& ctx, const TBlob& ograd, const TBlob& data, const OpReqType req, const NDArray& output) { - - int deterministic = dmlc::GetEnv("MXNET_DETERM", 0); - if (deterministic == 3) { - SparseEmbeddingOpBackwardRspImpl3(ctx, ograd, data, req, output); - return; - } else if (deterministic == 4) { - SparseEmbeddingOpBackwardRspImpl4(ctx, ograd, data, req, output); - return; - } else if (deterministic == 5) { - SparseEmbeddingOpBackwardRspImpl5(ctx, ograd, data, req, output); + if (param.force_deterministic) { + SparseEmbeddingOpBackwardDeterministicRspImpl(ctx, ograd, data, req, output); return; - } else if (deterministic != 0) { - LOG(FATAL) << "NOT REACHED"; } using namespace mshadow; using namespace mxnet_op; diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 1888a4179729..d26ed749e654 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -57,6 +57,28 @@ enum EmbeddingOpResource {kTempSpace}; } // namespace embedding +struct SparseEmbeddingParam: public dmlc::Parameter { + int input_dim; + int output_dim; + int dtype; + bool force_deterministic; + DMLC_DECLARE_PARAMETER(SparseEmbeddingParam) { + DMLC_DECLARE_FIELD(input_dim).set_lower_bound(1) + .describe("Vocabulary size of the input indices."); + DMLC_DECLARE_FIELD(output_dim).set_lower_bound(1) + .describe("Dimension of the embedding vectors."); + DMLC_DECLARE_FIELD(dtype).set_default(mshadow::kFloat32) + .add_enum("float32", mshadow::kFloat32) + .add_enum("float64", mshadow::kFloat64) + .add_enum("float16", mshadow::kFloat16) + .add_enum("uint8", mshadow::kUint8) + .add_enum("int32", mshadow::kInt32) + .describe("Data type of weight."); + DMLC_DECLARE_FIELD(force_deterministic).set_default(false) + .describe("Force the gradient computation to be executed according to a deterministic order."); + } +}; + struct EmbeddingParam: public dmlc::Parameter { int input_dim; int output_dim; @@ -130,14 +152,14 @@ inline void AddTakeGradLargeBatch(mshadow::Tensor dst, const mshadow::Tensor& index, const mshadow::Tensor &src, mshadow::Tensor* workspace = NULL); - +template inline bool EmbeddingOpShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { using namespace mshadow; const TShape &dshape = (*in_attrs)[embedding::kData]; if (dshape.ndim() == 0) return false; - const EmbeddingParam& param = nnvm::get(attrs.parsed); + const ParamType& param = nnvm::get(attrs.parsed); SHAPE_ASSIGN_CHECK(*in_attrs, embedding::kWeight, Shape2(param.input_dim, param.output_dim)); out_attrs->clear(); @@ -152,10 +174,11 @@ inline bool EmbeddingOpShape(const nnvm::NodeAttrs& attrs, return true; } +template inline bool EmbeddingOpType(const nnvm::NodeAttrs& attrs, std::vector *in_type, std::vector *out_type) { - const EmbeddingParam& param = nnvm::get(attrs.parsed); + const ParamType& param = nnvm::get(attrs.parsed); CHECK_EQ(in_type->size(), 2U); CHECK_GE(out_type->size(), 1U); int itype = (*in_type)[0]; @@ -560,7 +583,8 @@ struct AddTakeGradRspKernel { }; template -inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx, +inline void SparseEmbeddingOpBackwardRspImpl(const SparseEmbeddingParam& param, + const OpContext& ctx, const TBlob& ograd, const TBlob& data, const OpReqType req, @@ -582,9 +606,10 @@ void SparseEmbeddingOpBackwardEx(const nnvm::NodeAttrs& attrs, // check req CHECK_EQ(req[embedding::kData], 
kNullOp) << "SparseEmbedding layer doesn't support calculate data gradient"; + const SparseEmbeddingParam& param = nnvm::get(attrs.parsed); if (data.storage_type() == kDefaultStorage && ograd.storage_type() == kDefaultStorage && weight_grad.storage_type() == kRowSparseStorage) { - SparseEmbeddingOpBackwardRspImpl(ctx, ograd.data(), data.data(), + SparseEmbeddingOpBackwardRspImpl(param, ctx, ograd.data(), data.data(), req[embedding::kWeight], weight_grad); } else { LogUnimplementedOp(attrs, ctx, inputs, req, outputs); From fd1f619709363c73eab2223ef9ef4c17541e40cd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 25 Feb 2018 09:46:44 +0000 Subject: [PATCH 12/18] add doc --- src/operator/tensor/indexing_op.cc | 2 +- src/operator/tensor/indexing_op.cu | 39 +++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 2bc0936e059d..4e21f9c5f7f1 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -272,7 +272,7 @@ The storage type of weight must be `row_sparse`, and the gradient of the weight The operator is available on both CPU and GPU. When `force_deterministic` is set to `True`, the accumulation of gradients follows a deterministic order if a feature appears multiple times in the input. However, the - backward computation is usually slower when it is set to `True`. + accumulation is usually slower when the order is enforced. When the operator is used in recurrent neural network models on the GPU, the recommended value for `force_deterministic` is `True`. diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 32011674d48f..1a217ec1dee7 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -60,12 +60,23 @@ struct AddTakeGradRspGPUKernel { } }; - +/* + * \brief kernel for backward computation for take, executed with deterministic order + * \param thread_id the thread id + * \param out the output gradient data + * \param lookup_table the table to lookup the position of an id in gradient array + * \param sorted_data the sorted data input + * \param original_idx the original indices of the sorted data input + * \param ograd head gradient + * \param row_length the output dimension + * \param num_threads_per_row the number of threads to process a row together + * \param SZ the number of features a thread is responsible for + */ struct AddTakeGradRspDeterministicKernel { template __device__ __forceinline__ static void Map(int thread_id, DType* out, - const nnvm::dim_t* prefix_sum, + const nnvm::dim_t* lookup_table, const nnvm::dim_t* sorted_data, const nnvm::dim_t data_size, const nnvm::dim_t* original_idx, @@ -82,7 +93,7 @@ struct AddTakeGradRspDeterministicKernel { do { const dim_t data = sorted_data[tid]; const dim_t idx = original_idx[tid]; - const dim_t row_id = prefix_sum[data]; + const dim_t row_id = lookup_table[data]; const dim_t ograd_offset = idx * row_length; const dim_t out_offset = row_id * row_length; for (int i = feature_start; i < feature_end; i++) { @@ -94,6 +105,20 @@ struct AddTakeGradRspDeterministicKernel { } }; +/* + * \brief the kernel to generate a lookup table for positions of row ids + * \param i thread id + * \param out output table + * \param data the input row id in sorted order + */ +struct mark_lookup_table { + template + MSHADOW_XINLINE static void Map(int i, IType* out, const DType* data) { + out[static_cast(data[i])] = i; + } +}; + + template<> 
void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, const TBlob& data, @@ -137,14 +162,6 @@ void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, } } - -struct mark_lookup_table { - template - MSHADOW_XINLINE static void Map(int i, IType* out, const DType* data) { - out[static_cast(data[i])] = i; - } -}; - inline void SparseEmbeddingOpBackwardDeterministicRspImpl(const OpContext& ctx, const TBlob& ograd, const TBlob& data, From e87697a86e1b63c08ad6bb29dd851cb916d719f7 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 25 Feb 2018 11:50:50 +0000 Subject: [PATCH 13/18] fix lint --- src/operator/tensor/indexing_op.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 1a217ec1dee7..e27aa49877e8 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -227,8 +227,8 @@ inline void SparseEmbeddingOpBackwardDeterministicRspImpl(const OpContext& ctx, // generate original idx Tensor original_idx_tensor(original_idx, Shape1(data_size), s); - Kernel::Launch(s, data_size, 1, static_cast(0), static_cast(1), - kWriteTo, original_idx); + Kernel::Launch(s, data_size, 1, static_cast(0), + static_cast(1), kWriteTo, original_idx); // sort data with its original idx int num_bits = ilog2(num_rows - 1); char* temp_storage_ptr = reinterpret_cast(temp_storage); @@ -242,8 +242,8 @@ inline void SparseEmbeddingOpBackwardDeterministicRspImpl(const OpContext& ctx, // fill row_idx array of output matrix, using the row_flg values RType* grad_row_idx = output.aux_data(kIdx).dptr(); - cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data, grad_row_idx, - grad_row_idx + data_size, data_size, Stream::GetStream(s)); + cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data, + grad_row_idx, grad_row_idx + data_size, data_size, Stream::GetStream(s)); dim_t nnr = 0; CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType), From 1feb99d88bc4a3b6d40cc411697f1a3fc662f91d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 25 Feb 2018 12:10:36 +0000 Subject: [PATCH 14/18] update test --- tests/python/unittest/test_sparse_operator.py | 64 ++++++++++--------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index b9447626d081..8fea38d9c033 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -1616,42 +1616,44 @@ def check_sparse_elementwise_sum_with_shape(stype, shape, n): def test_sparse_embedding(): ''' test sparse embedding operator ''' - def check_sparse_embedding(executor, weight_ref, data_onehot, grad, density): - # update weight based on density - weight[:] = rand_ndarray(weight.shape, 'row_sparse', density=density) - # check forward - executor.forward(is_train=True) - assert_almost_equal(executor.outputs[0].asnumpy(), np.dot(data_onehot, weight.asnumpy())) - # check backward - executor.backward([grad]) - assert_almost_equal(grad_map["embed_weight"].asnumpy(), np.dot(data_onehot.T, grad.asnumpy())) + def check_sparse_embedding(in_dim, out_dim, batch, densities, deterministic): + # init executor + data = mx.sym.Variable("data") + weight = mx.sym.Variable("embed_weight", stype='row_sparse') + embed = mx.sym.contrib.SparseEmbedding(data=data, weight=weight, input_dim=in_dim, + output_dim=out_dim, force_deterministic=deterministic, + name="embed") + grad_req = 
{'data': 'null', 'embed_weight': 'write'} + exe_test = embed.simple_bind(default_context(), grad_req=grad_req, data=(batch,)) + arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays)) + grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays)) + # init data + np_data = np.random.randint(low=0, high=in_dim, size=batch) + np_onehot = np.zeros((batch, in_dim)).astype(np.float32) + np_onehot[np.arange(batch), np_data] = 1.0 + arg_map["data"][:] = np_data + # init grad + np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape) + grad = mx.nd.zeros(np_grad.shape) + grad[:] = np_grad + # weight + weight = arg_map["embed_weight"] + for density in densities: + # update weight based on density + weight[:] = rand_ndarray(weight.shape, 'row_sparse', density=density) + # check forward + exe_test.forward(is_train=True) + assert_almost_equal(exe_test.outputs[0].asnumpy(), np.dot(np_onehot, weight.asnumpy())) + # check backward + exe_test.backward([grad]) + assert_almost_equal(grad_map["embed_weight"].asnumpy(), np.dot(np_onehot.T, grad.asnumpy())) densities = [0, 0.5, 1] in_dim = 50 out_dim = 3 batch = 8 - # init executor - data = mx.sym.Variable("data") - weight = mx.sym.Variable("embed_weight", stype='row_sparse') - embed = mx.sym.contrib.SparseEmbedding(data=data, weight=weight, input_dim=in_dim, - output_dim=out_dim, name="embed") - grad_req = {'data': 'null', 'embed_weight': 'write'} - exe_test = embed.simple_bind(default_context(), grad_req=grad_req, data=(batch,)) - arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays)) - grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays)) - # init data - np_data = np.random.randint(low=0, high=in_dim, size=batch) - np_onehot = np.zeros((batch, in_dim)) - np_onehot[np.arange(batch), np_data] = 1.0 - arg_map["data"][:] = np_data - # init grad - np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape) - grad = mx.nd.zeros(np_grad.shape) - grad[:] = np_grad - # weight - weight = arg_map["embed_weight"] - for density in densities: - check_sparse_embedding(exe_test, weight, np_onehot, grad, density) + check_sparse_embedding(in_dim, out_dim, batch, densities, True) + check_sparse_embedding(in_dim, out_dim, batch, densities, False) def test_scatter_ops(): From b6c6fce2f0249f5273766d7521fbfeccd31d5c65 Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Tue, 27 Feb 2018 13:38:58 +0000 Subject: [PATCH 15/18] CR comments --- src/operator/tensor/indexing_op.cc | 4 +- src/operator/tensor/indexing_op.cu | 213 ++++++++++-------- src/operator/tensor/indexing_op.h | 7 +- tests/python/unittest/test_sparse_operator.py | 2 +- 4 files changed, 124 insertions(+), 102 deletions(-) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 4e21f9c5f7f1..bb65419a79c8 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -270,11 +270,11 @@ The storage type of weight must be `row_sparse`, and the gradient of the weight `SparseEmbedding` is designed for the use case where `input_dim` is very large (e.g. 100k). The operator is available on both CPU and GPU. - When `force_deterministic` is set to `True`, the accumulation of gradients follows a + When `deterministic` is set to `True`, the accumulation of gradients follows a deterministic order if a feature appears multiple times in the input. However, the accumulation is usually slower when the order is enforced. 
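A minimal usage sketch of the renamed `deterministic` flag, mirroring the unit test later in this series (the dimensions are illustrative):

    import mxnet as mx

    data = mx.sym.Variable('data')
    weight = mx.sym.Variable('embed_weight', stype='row_sparse')
    embed = mx.sym.contrib.SparseEmbedding(data=data, weight=weight,
                                           input_dim=50, output_dim=3,
                                           deterministic=True, name='embed')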
When the operator is used in recurrent neural network models on the GPU, - the recommended value for `force_deterministic` is `True`. + the recommended value for `deterministic` is `True`. Examples:: diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index e27aa49877e8..85a450b90c12 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -72,6 +72,7 @@ struct AddTakeGradRspGPUKernel { * \param num_threads_per_row the number of threads to process a row together * \param SZ the number of features a thread is responsible for */ +template struct AddTakeGradRspDeterministicKernel { template __device__ __forceinline__ static void Map(int thread_id, @@ -82,25 +83,34 @@ struct AddTakeGradRspDeterministicKernel { const nnvm::dim_t* original_idx, const DType* ograd, const nnvm::dim_t row_length, - const nnvm::dim_t num_threads_per_row, - const int SZ) { + const nnvm::dim_t num_threads_per_row) { using nnvm::dim_t; int tid = thread_id / num_threads_per_row; const int feature_start = thread_id % num_threads_per_row * SZ; - int feature_end = feature_start + SZ; - if (feature_end > row_length) feature_end = row_length; + int num_features = SZ; + if (feature_start + num_features > row_length) { + num_features = row_length - feature_start; + } if (tid == 0 || sorted_data[tid - 1] != sorted_data[tid]) { + DType acc[SZ]; + #pragma unroll + for (int i = 0; i < SZ; i++) { + acc[i] = 0; + } + const dim_t data = sorted_data[tid]; + const dim_t row_id = lookup_table[data]; + const dim_t out_offset = row_id * row_length + feature_start; do { - const dim_t data = sorted_data[tid]; const dim_t idx = original_idx[tid]; - const dim_t row_id = lookup_table[data]; - const dim_t ograd_offset = idx * row_length; - const dim_t out_offset = row_id * row_length; - for (int i = feature_start; i < feature_end; i++) { - out[out_offset + i] += ograd[ograd_offset + i]; + const dim_t ograd_offset = idx * row_length + feature_start; + for (int i = 0; i < num_features; i++) { + acc[i] += ograd[ograd_offset + i]; } tid++; } while (tid < data_size && sorted_data[tid - 1] == sorted_data[tid]); + for (int i = 0; i < num_features; i++) { + out[out_offset + i] = acc[i]; + } } } }; @@ -162,24 +172,109 @@ void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, } } +template +void SparseEmbeddingDeterministicKernelLaunch(const OpContext& ctx, + const TBlob& ograd, + const TBlob& data, + const OpReqType req, + const NDArray& output) { + using namespace mshadow; + using namespace mxnet_op; + using namespace expr; + using namespace rowsparse; + using nnvm::dim_t; + mshadow::Stream *s = ctx.get_stream(); + const dim_t num_rows = output.shape()[0]; + const dim_t row_length = output.shape()[1]; + const dim_t data_size = static_cast(data.shape_.Size()); + // temp resource declarations + dim_t* lookup_table = NULL; + void* temp_storage = NULL; + dim_t* sorted_data = NULL; + dim_t* original_idx = NULL; + // calculate number of bytes for temp resources + size_t lookup_table_bytes = num_rows * sizeof(dim_t); + size_t sorted_data_storage_bytes = data_size * sizeof(dim_t); + size_t original_idx_storage_bytes = data_size * sizeof(dim_t); + size_t sort_workspace_size = SortByKeyWorkspaceSize(data_size); + size_t unique_workspace_bytes = 0; + // estimate unique temp space + IType* data_ptr = data.dptr(); + size_t *null_ptr = nullptr; + cub::DeviceSelect::Unique(NULL, unique_workspace_bytes, data_ptr, data_ptr, + null_ptr, data_size, Stream::GetStream(s)); + // One more space reserved 
for unique count + size_t temp_workspace_bytes = std::max(unique_workspace_bytes, + sort_workspace_size); + size_t total_storage_bytes = lookup_table_bytes + sorted_data_storage_bytes + + original_idx_storage_bytes + temp_workspace_bytes; + + // request resource and split it. layout is: + // lookup_table, sorted_data, original_idx, temp_storage + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(total_storage_bytes), s); + lookup_table = reinterpret_cast(workspace.dptr_); + sorted_data = reinterpret_cast(workspace.dptr_ + lookup_table_bytes); + original_idx = reinterpret_cast(workspace.dptr_ + lookup_table_bytes + + sorted_data_storage_bytes); + temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; + + // make a copy of the data, to be sorted + TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); + auto sorted_data_tensor = sorted_data_blob.FlatTo1D(s); + mxnet_op::copy(s, sorted_data_blob, data); + + // generate original idx + Tensor original_idx_tensor(original_idx, Shape1(data_size), s); + Kernel::Launch(s, data_size, 1, static_cast(0), + static_cast(1), kWriteTo, original_idx); + // sort data with its original idx + int num_bits = ilog2(num_rows - 1); + char* temp_storage_ptr = reinterpret_cast(temp_storage); + Tensor temp_storage_tensor(temp_storage_ptr, + Shape1(sort_workspace_size), s); + SortByKey(sorted_data_tensor, original_idx_tensor, true, + &temp_storage_tensor, 0, num_bits); + + // compute unique row ids based on sorted values. + output.CheckAndAllocAuxData(kIdx, Shape1(data_size + 1)); + + // fill row_idx array of output matrix, using the row_flg values + RType* grad_row_idx = output.aux_data(kIdx).dptr(); + cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data, + grad_row_idx, grad_row_idx + data_size, data_size, Stream::GetStream(s)); + + dim_t nnr = 0; + CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType), + cudaMemcpyDeviceToHost)); + CHECK_EQ(output.shape().ndim(), 2) << "Unexcepted ndim"; + output.CheckAndAllocData(Shape2(nnr, output.shape()[1])); + output.set_aux_shape(kIdx, Shape1(nnr)); + + // generate lookup table + Kernel::Launch(s, nnr, lookup_table, grad_row_idx); + + // accumulate gradients + DType* grad_data = output.data().dptr(); + const int SZ = 4; + const nnvm::dim_t num_threads_per_row = (row_length + SZ - 1) / SZ; + Kernel, gpu>::Launch(s, data_size * num_threads_per_row, + grad_data, lookup_table, sorted_data, data_size, original_idx, + ograd.dptr(), row_length, num_threads_per_row); +} + inline void SparseEmbeddingOpBackwardDeterministicRspImpl(const OpContext& ctx, const TBlob& ograd, const TBlob& data, const OpReqType req, const NDArray& output) { - using namespace mshadow; - using namespace mxnet_op; - using namespace expr; - using namespace rowsparse; using nnvm::dim_t; if (req == kNullOp) return; CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support " << "weight gradient calculation with req != write"; - Stream *s = ctx.get_stream(); - dim_t num_rows = output.shape()[0]; - dim_t row_length = output.shape()[1]; - dim_t data_size = static_cast(data.shape_.Size()); + mshadow::Stream *s = ctx.get_stream(); + const dim_t data_size = static_cast(data.shape_.Size()); if (data_size == 0) { FillZerosRspImpl(s, output); return; @@ -187,83 +282,9 @@ inline void SparseEmbeddingOpBackwardDeterministicRspImpl(const OpContext& ctx, MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { - 
MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { - // temp resource declarations - dim_t* lookup_table = NULL; - void* temp_storage = NULL; - dim_t* sorted_data = NULL; - dim_t* original_idx = NULL; - // calculate number of bytes for temp resources - size_t lookup_table_bytes = num_rows * sizeof(dim_t); - size_t sorted_data_storage_bytes = data_size * sizeof(dim_t); - size_t original_idx_storage_bytes = data_size * sizeof(dim_t); - size_t sort_workspace_size = SortByKeyWorkspaceSize(data_size); - size_t unique_workspace_bytes = 0; - // estimate unique temp space - IType* data_ptr = data.dptr(); - size_t *null_ptr = nullptr; - cub::DeviceSelect::Unique(NULL, unique_workspace_bytes, data_ptr, data_ptr, - null_ptr, data_size, Stream::GetStream(s)); - // One more space reserved for unique count - size_t temp_workspace_bytes = std::max(unique_workspace_bytes, - sort_workspace_size); - size_t total_storage_bytes = lookup_table_bytes + sorted_data_storage_bytes + - original_idx_storage_bytes + temp_workspace_bytes; - - // request resource and split it. layout is: - // lookup_table, sorted_data, original_idx, temp_storage - Tensor workspace = ctx.requested[0] - .get_space_typed(Shape1(total_storage_bytes), s); - lookup_table = reinterpret_cast(workspace.dptr_); - sorted_data = reinterpret_cast(workspace.dptr_ + lookup_table_bytes); - original_idx = reinterpret_cast(workspace.dptr_ + lookup_table_bytes + - sorted_data_storage_bytes); - temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; - - // make a copy of the data, to be sorted - TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); - auto sorted_data_tensor = sorted_data_blob.FlatTo1D(s); - mxnet_op::copy(s, sorted_data_blob, data); - - // generate original idx - Tensor original_idx_tensor(original_idx, Shape1(data_size), s); - Kernel::Launch(s, data_size, 1, static_cast(0), - static_cast(1), kWriteTo, original_idx); - // sort data with its original idx - int num_bits = ilog2(num_rows - 1); - char* temp_storage_ptr = reinterpret_cast(temp_storage); - Tensor temp_storage_tensor(temp_storage_ptr, - Shape1(sort_workspace_size), s); - SortByKey(sorted_data_tensor, original_idx_tensor, true, - &temp_storage_tensor, 0, num_bits); - - // compute unique row ids based on sorted values. 
-        output.CheckAndAllocAuxData(kIdx, Shape1(data_size + 1));
-
-        // fill row_idx array of output matrix, using the row_flg values
-        RType* grad_row_idx = output.aux_data(kIdx).dptr<RType>();
-        cub::DeviceSelect::Unique(temp_storage_ptr, unique_workspace_bytes, sorted_data,
-          grad_row_idx, grad_row_idx + data_size, data_size, Stream<gpu>::GetStream(s));
-
-        dim_t nnr = 0;
-        CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType),
-          cudaMemcpyDeviceToHost));
-        CHECK_EQ(output.shape().ndim(), 2) << "Unexcepted ndim";
-        output.CheckAndAllocData(Shape2(nnr, output.shape()[1]));
-        output.set_aux_shape(rowsparse::kIdx, Shape1(nnr));
-
-        // generate lookup table
-        Kernel<MarkLookupTable, gpu>::Launch(s, nnr, lookup_table, grad_row_idx);
-
-        // accumulate gradients
-        DType* grad_data = output.data().dptr<DType>();
-        Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask),
-             kWriteTo, 0);
-        const int SZ = 4;
-        const nnvm::dim_t num_threads_per_row = (row_length + SZ - 1) / SZ;
-        Kernel<AddTakeGradRspDeterministicKernel, gpu>::Launch(s, data_size * num_threads_per_row,
-          grad_data, lookup_table, sorted_data, data_size, original_idx,
-          ograd.dptr<DType>(), row_length, num_threads_per_row, SZ);
+      MSHADOW_IDX_TYPE_SWITCH(output.aux_type(rowsparse::kIdx), RType, {
+        SparseEmbeddingDeterministicKernelLaunch<IType, DType, RType>(ctx, ograd, data,
+                                                                      req, output);
       });
     });
   });
@@ -277,7 +298,7 @@ inline void SparseEmbeddingOpBackwardRspImpl(const SparseEmbeddingParam& pa
                                              const TBlob& data,
                                              const OpReqType req,
                                              const NDArray& output) {
-  if (param.force_deterministic) {
+  if (param.deterministic) {
     SparseEmbeddingOpBackwardDeterministicRspImpl(ctx, ograd, data, req, output);
     return;
   }
diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index d26ed749e654..ef8f73b23f0f 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -61,7 +61,7 @@ struct SparseEmbeddingParam: public dmlc::Parameter<SparseEmbeddingParam> {
   int input_dim;
   int output_dim;
   int dtype;
-  bool force_deterministic;
+  bool deterministic;
   DMLC_DECLARE_PARAMETER(SparseEmbeddingParam) {
     DMLC_DECLARE_FIELD(input_dim).set_lower_bound(1)
     .describe("Vocabulary size of the input indices.");
@@ -74,8 +74,9 @@ struct SparseEmbeddingParam: public dmlc::Parameter<SparseEmbeddingParam> {
     .add_enum("uint8", mshadow::kUint8)
     .add_enum("int32", mshadow::kInt32)
     .describe("Data type of weight.");
-    DMLC_DECLARE_FIELD(force_deterministic).set_default(false)
-    .describe("Force the gradient computation to be executed according to a deterministic order.");
+    DMLC_DECLARE_FIELD(deterministic).set_default(false)
+    .describe("Force the backward gradient calculation to be executed based on a deterministic \
+               order at the cost of slower speed.");
   }
 };

diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index 8fea38d9c033..e0d25da03449 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -1621,7 +1621,7 @@ def check_sparse_embedding(in_dim, out_dim, batch, densities, deterministic):
         data = mx.sym.Variable("data")
         weight = mx.sym.Variable("embed_weight", stype='row_sparse')
         embed = mx.sym.contrib.SparseEmbedding(data=data, weight=weight, input_dim=in_dim,
-                                               output_dim=out_dim, force_deterministic=deterministic,
+                                               output_dim=out_dim, deterministic=deterministic,
                                                name="embed")
         grad_req = {'data': 'null', 'embed_weight': 'write'}
         exe_test = embed.simple_bind(default_context(), grad_req=grad_req, data=(batch,))

From 35e9b076dab01c380e92f7f87b1845f4be6fa986 Mon Sep 17 00:00:00 2001
From: ZiyueHuang
Date: Tue, 27 Feb 2018 13:47:11 +0000
Subject: [PATCH 16/18] lint

---
 src/operator/tensor/indexing_op.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index ef8f73b23f0f..0e158ea907ca 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -75,8 +75,8 @@ struct SparseEmbeddingParam: public dmlc::Parameter<SparseEmbeddingParam> {
     .add_enum("int32", mshadow::kInt32)
     .describe("Data type of weight.");
     DMLC_DECLARE_FIELD(deterministic).set_default(false)
-    .describe("Force the backward gradient calculation to be executed based on a deterministic \
-               order at the cost of slower speed.");
+    .describe("Force the backward gradient calculation to be executed based on a deterministic"
+              " order at the cost of slower speed.");
   }
 };

From 6551c38de83e9063790cc7e8cdcc317c570811d0 Mon Sep 17 00:00:00 2001
From: ZiyueHuang
Date: Wed, 28 Feb 2018 03:18:12 +0000
Subject: [PATCH 17/18] set grad to be 0s initially

---
 src/operator/tensor/indexing_op.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu
index 85a450b90c12..5cdf5060aec4 100644
--- a/src/operator/tensor/indexing_op.cu
+++ b/src/operator/tensor/indexing_op.cu
@@ -109,7 +109,7 @@ struct AddTakeGradRspDeterministicKernel {
         tid++;
       } while (tid < data_size && sorted_data[tid - 1] == sorted_data[tid]);
       for (int i = 0; i < num_features; i++) {
-        out[out_offset + i] = acc[i];
+        out[out_offset + i] += acc[i];
       }
     }
   }
@@ -256,6 +256,8 @@ void SparseEmbeddingDeterministicKernelLaunch(const OpContext& ctx,

   // accumulate gradients
   DType* grad_data = output.data().dptr<DType>();
+  Fill(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask),
+       kWriteTo, 0);
   const int SZ = 4;
   const nnvm::dim_t num_threads_per_row = (row_length + SZ - 1) / SZ;
   Kernel<AddTakeGradRspDeterministicKernel<SZ>, gpu>::Launch(s, data_size * num_threads_per_row,

From 86f3833015814f6272836eeec14646b4b1ebb7f4 Mon Sep 17 00:00:00 2001
From: ZiyueHuang
Date: Wed, 28 Feb 2018 13:42:31 +0000
Subject: [PATCH 18/18] add warning

---
 src/operator/tensor/indexing_op.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index 0e158ea907ca..45bf45f14fcd 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -243,6 +243,11 @@ inline bool SparseEmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs,
       dispatched = true;
     }
   }
+  const SparseEmbeddingParam& param = nnvm::get<SparseEmbeddingParam>(attrs.parsed);
+  if (param.deterministic) {
+    common::LogOnce("_SparseEmbedding_backward with deterministic=True may reduce "
+                    "speed significantly");
+  }
   return dispatched;
 }
 /*! \brief name the struct Take instead of take
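A minimal usage sketch of the renamed flag (not part of the patches above), mirroring the updated unit test: only the operator name, argument names, and the `deterministic` flag come from the diffs; the vocabulary/embedding sizes below are illustrative.

import mxnet as mx

# Build a contrib.SparseEmbedding whose backward pass accumulates gradients
# in a deterministic order. input_dim/output_dim are placeholder values.
data = mx.sym.Variable("data")
weight = mx.sym.Variable("embed_weight", stype='row_sparse')
embed = mx.sym.contrib.SparseEmbedding(data=data, weight=weight, input_dim=100,
                                       output_dim=16, deterministic=True,
                                       name="embed")
# With deterministic=True the backward pass sorts the input indices and adds
# gradient rows in a fixed order, so repeated runs give reproducible
# 'embed_weight' gradients, at the cost of the slowdown warned about above.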