diff --git a/SparseLinear.lua b/SparseLinear.lua
index 77ef7c3db..6185df991 100644
--- a/SparseLinear.lua
+++ b/SparseLinear.lua
@@ -1,19 +1,25 @@
 local THNN = require 'nn.THNN'
 local SparseLinear, parent = torch.class('nn.SparseLinear', 'nn.Module')
 
-function SparseLinear:__init(inputSize, outputSize)
+local NO_LAST_INPUT = 0
+local ONE_LAST_INPUT = 1
+local ACC_MULTIPLE_TIMES = 2
+
+function SparseLinear:__init(inputSize, outputSize, doGradInput)
    parent.__init(self)
 
    self.weightDecay = 0
+   self.doGradInput = doGradInput or false
    self.weight = torch.Tensor(outputSize, inputSize):zero()
    self.bias = torch.Tensor(outputSize):zero()
    self.gradWeight = torch.Tensor(outputSize, inputSize):zero()
    self.gradBias = torch.Tensor(outputSize):zero()
-   self.lastInput = nil
 
-   if torch.getnumthreads() > 1 and outputSize >= 128 then
-      self.shardBuffer = torch.Tensor(outputSize, torch.getnumthreads())
-   end
+   assert(type(self.doGradInput) == type(true))
+
+   self.lastInput = nil
+   self.sparseUpdate = NO_LAST_INPUT
+   self.formatted_input = nil
 
    -- state
    self.gradInput:resize(inputSize)
@@ -33,78 +39,148 @@ function SparseLinear:reset(stdv)
 end
 
 function SparseLinear:reshapeInput(input)
-   if input:dim() == 2 then
-      return input:view(1, input:size(1), input:size(2)), false
+   if type(input) == 'table' then -- table input: handled as a batch on the non-legacy path
+      return input, true, false -- callers unpack this as input, batchMode, legacyMode
    else
-      return input, true
+      if input:dim() == 2 then -- a single 2D tensor is wrapped into a one-element table
+         return {input}, false, false
+      else -- any other tensor is passed through unchanged and flagged as legacy
+         return input, true, true
+      end
    end
 end
 
 function SparseLinear:updateOutput(input)
-   self.cudaBuffer = self.cudaBuffer or input.new()
-   local input, batchMode = self:reshapeInput(input)
-
-   input.THNN.SparseLinear_updateOutput(
-      input:cdata(),
-      self.output:cdata(),
-      self.weight:cdata(),
-      self.bias:cdata(),
-      self.cudaBuffer:cdata(),
-      THNN.optionalTensor(self.shardBuffer)
-   )
-
-   -- fix output size for batchSize = 1
-   if not batchMode then
-      self.output:set(self.output:view(self.output:size(2)))
-   end
+   local input, batchMode, legacyMode = self:reshapeInput(input)
+   self.legacyMode = legacyMode -- read again by updateGradInput and the sparse update/zero paths
 
-   return self.output
-end
+   if legacyMode then
+      input.THNN.SparseLinear_legacyUpdateOutput(
+         input:cdata(),
+         self.output:cdata(),
+         self.weight:cdata(),
+         self.bias:cdata()
+      )
+   else
+      local nbatches = #input
+      if nbatches == 0 then -- empty batch: the output is just the bias
+         self.output:resizeAs(self.bias):copy(self.bias) -- resize first; copy needs matching sizes
+         return self.output
+      end
 
-function SparseLinear:accGradParameters(input, gradOutput, scale)
-   local input, batchMode = self:reshapeInput(input)
+      local size = 0
+      local marker = 1
+      self.formatted_input = self.formatted_input or input[1].new()
+
+      for _, sample in ipairs(input) do size = size + sample:size(1) end
+      self.formatted_input:resize(size, 3) -- one (sample index, feature index, value) row per entry
+      for i, sample in ipairs(input) do
+         local buf = self.formatted_input:narrow(1, marker, sample:size(1))
+         buf:narrow(2,2,2):copy(sample)
+         buf:select(2,1):fill(i)
+         marker = marker + sample:size(1)
+      end
 
-   self.lastInput = self.lastInput or input.new()
-   self.lastInput:resizeAs(input):copy(input)
-   if not batchMode then
-      gradOutput = gradOutput:view(1, gradOutput:size(1))
+      self.output:resize(nbatches, self.weight:size(1))
+      input[1].THNN.SparseLinear_updateOutput(
+         self.formatted_input:cdata(),
+         self.output:cdata(),
+         self.weight:cdata(),
+         self.bias:cdata()
+      )
+
+      -- fix output size for batchSize = 1
+      if not batchMode then
+         self.output = self.output[1]
+      end
    end
 
-   input.THNN.SparseLinear_accGradParameters(
-      input:cdata(),
-      gradOutput:cdata(),
-      self.gradWeight:cdata(),
-      self.gradBias:cdata(),
-      self.weight:cdata(),
-      self.bias:cdata(),
-      self.weightDecay or 0,
-      scale or 1
-   )
+   return self.output
 end
 
-function SparseLinear:updateGradInput(input, gradOutput)
-   if self.gradInput then
-      local input, batchMode = self:reshapeInput(input)
-      if not batchMode then
-         gradOutput = gradOutput:view(1, gradOutput:size(1))
+function SparseLinear:accGradParameters(input, gradOutput, scale)
+   local input, batchMode, legacyMode = self:reshapeInput(input)
+   self.legacyMode = legacyMode
+
+   if legacyMode then
+      self.lastInput = self.lastInput or input.new()
+      if self.sparseUpdate == NO_LAST_INPUT then -- first accumulation: remember this input
+         self.lastInput:resizeAs(input):copy(input)
+         self.sparseUpdate = ONE_LAST_INPUT
+      elseif self.sparseUpdate == ONE_LAST_INPUT then -- second accumulation: lastInput alone no longer suffices
+         self.sparseUpdate = ACC_MULTIPLE_TIMES
       end
-      input.THNN.SparseLinear_updateGradInput(
+
+      input.THNN.SparseLinear_legacyAccGradParameters(
          input:cdata(),
          gradOutput:cdata(),
-         self.gradInput:cdata(),
-         self.weight:cdata()
+         self.gradWeight:cdata(),
+         self.gradBias:cdata(),
+         self.weight:cdata(),
+         self.bias:cdata(),
+         self.weightDecay or 0,
+         scale or 1
       )
-      -- fix gradInput size for batchSize = 1
+   else
       if not batchMode then
-         self.gradInput:set(self.gradInput:view(self.gradInput:size(2), self.gradInput:size(3)))
+         gradOutput = gradOutput:view(1, gradOutput:size(1)) -- a view: the caller's tensor keeps its 1D shape
       end
 
-      return self.gradInput
+      input[1].THNN.SparseLinear_accGradParameters(
+         self.formatted_input:cdata(),
+         gradOutput:cdata(),
+         self.gradWeight:cdata(),
+         self.gradBias:cdata(),
+         self.weight:cdata(),
+         self.bias:cdata(),
+         self.weightDecay or 0,
+         scale or 1
+      )
    end
 end
 
+function SparseLinear:updateGradInput(input, gradOutput)
+   if self.legacyMode then -- legacy path: gradInput is a tensor shaped like input
+      if type(self.gradInput) ~= type(gradOutput) then self.gradInput = gradOutput.new() end
+      self.gradInput:resizeAs(input)
+   else -- table path: gradInput is a table, left empty unless doGradInput is set
+      self.gradInput = {}
+   end
+   if self.doGradInput then
+      -- GradInput should be dense anyway
+      local gi
+      local batchMode = true
+      if gradOutput:dim() == 1 then
+         gi = self.weight:t()*gradOutput
+         batchMode = false
+      elseif gradOutput:dim() == 2 then
+         gi = gradOutput*self.weight
+      end
+      local ini = self.weight:size(2)
+
+      if self.legacyMode then
+         local batches = self.gradInput:size(1)
+         self.gradInput:resize(batches, ini, 2)
+         self.gradInput:select(3,1):copy(torch.repeatTensor(torch.range(1, ini), batches, 1))
+         self.gradInput:select(3,2):copy(gi)
+      else
+         local indices = torch.range(1, ini) -- built once and shared by every sample; no global leak
+         if not batchMode then gi:resize(1, ini) end
+         for i = 1,gi:size(1) do
+            self.gradInput[i] = gradOutput.new(ini, 2)
+            self.gradInput[i]:select(2, 2):copy(gi[i])
+            self.gradInput[i]:select(2, 1):copy(indices)
+         end
+      end
+   end
+   return self.gradInput
+end
+
+-- These functions do sparse updates / zeros. However, if we accumulated 
+-- gradients multiple times, we can't depend on the last input to do sparse
+-- updates.
 function SparseLinear:updateParameters(learningRate)
-   if self.lastInput then
+   if self.lastInput and self.legacyMode and self.sparseUpdate == ONE_LAST_INPUT then
       self.lastInput.THNN.SparseLinear_updateParameters(
          self.weight:cdata(),
          self.bias:cdata(),
@@ -116,22 +192,24 @@ function SparseLinear:updateParameters(learningRate)
    else
       parent.updateParameters(self, learningRate)
    end
+   self.sparseUpdate = NO_LAST_INPUT -- the next legacy accGradParameters records a fresh lastInput
 end
 
 function SparseLinear:zeroGradParameters()
-   if self.lastInput then
+   if self.lastInput and self.legacyMode and self.sparseUpdate == ONE_LAST_INPUT then
       self.lastInput.THNN.SparseLinear_zeroGradParameters(
          self.gradWeight:cdata(),
          self.gradBias:cdata(),
          self.lastInput:cdata()
       )
    else
       parent.zeroGradParameters(self)
    end
+   self.sparseUpdate = NO_LAST_INPUT -- the next legacy accGradParameters records a fresh lastInput
 end
 
 function SparseLinear:clearState()
    if self.lastInput then self.lastInput:set() end
-   if self.cudaBuffer then self.cudaBuffer:set() end
+   self.weight.THNN.SparseLinear_cudaClearState() -- no `input` in scope here; dispatch through the weight tensor's backend
    return parent.clearState(self)
 end
diff --git a/lib/THNN/generic/SparseLinear.c b/lib/THNN/generic/SparseLinear.c
index a84e03039..2e24d9192 100644
--- a/lib/THNN/generic/SparseLinear.c
+++ b/lib/THNN/generic/SparseLinear.c
@@ -5,15 +5,21 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
+#include <stdio.h>
 
 #define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
 #define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
 
-static bool THNN_(checkInput)(THTensor* t)
+static bool THNN_(checkLegacyInput)(THTensor* t) // legacy layout: 3D tensor whose last dimension has size 2
 {
   return t->nDimension == 3 && t->size[2] == 2;
 }
 
+static bool THNN_(checkInput)(THTensor* t) // COO layout: one row per entry, nnz x 3
+{
+  return t->nDimension == 2 && t->size[1] == 3; // kernels read columns 0/1/2 as 1-based row, 1-based column, value
+}
+
 static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
 {
   return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
@@ -41,15 +47,61 @@ void THNN_(SparseLinear_updateOutput)(
           THTensor *input,
           THTensor *output,
           THTensor *weight,
-          THTensor *bias,
-          THTensor *cudaBuffer,
-          THTensor *shardBuffer)
+          THTensor *bias)
+{
+  long h, i;
+  long outDim = THTensor_(size)(weight, 0);
+  long inDim = THTensor_(size)(weight, 1);
+  long batchSize = THTensor_(size)(output, 0);
+
+  THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
+  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+  long nnz = THTensor_(size)(input, 0);
+
+  // output = weight * input + bias
+  THTensor_(zero)(output);
+  // Serial on purpose: entries sharing a batch row accumulate into the same output row.
+  for (i = 0; i < nnz; i++) {
+    real val = THNN_(get2d)(input, i, 2);
+    if (val == 0) {
+      continue;
+    }
+
+    long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; // 0-based feature index
+    long row = (long)(THNN_(get2d)(input, i, 0)) - 1; // 0-based batch row
+    if (offset >= 0 && offset < inDim) {
+      THBlas_(axpy)(outDim,
+                    val,
+                    COL_PTR2(weight, offset), weight->stride[0],
+                    ROW_PTR2(output, row), output->stride[1]);
+    } else {
+      THError("index out of bound. updateOutput: %ld not between 1 and %ld",
+              offset + 1, inDim);
+    }
+  }
+
+  THTensor* output_row = THTensor_(new)();
+  for (h = 0; h < batchSize; h++) {
+    THTensor_(select)(output_row, output, 0, h);
+    THTensor_(cadd)(output_row, bias, 1.0, output_row);
+  }
+  THTensor_(free)(output_row);
+}
+
+void THNN_(SparseLinear_legacyUpdateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias)
 {
   long h, i;
   long outDim = THTensor_(size)(weight, 0);
   long inDim = THTensor_(size)(weight, 1);
 
-  THArgCheck(THNN_(checkInput)(input), 2, "input size must be batchsize x nnz x 2");
+  THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
   THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
   THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
 
@@ -105,6 +157,65 @@ void THNN_(SparseLinear_accGradParameters)(
   long inDim = THTensor_(size)(weight, 1);
 
   THArgCheck(THNN_(checkInput)(input), 2,
+             "input must be in coo format, nnz x 3");
+  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+             "gradWeight size wrong");
+  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+             "gradBias size wrong");
+  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+             "gradOutput must be contiguous");
+
+  long nnz = THTensor_(size)(input, 0);
+  // gradOutput rows are addressed by the 1-based batch index stored in input column 0
+
+  // gradWeight += gradOutput * input
+  // Serial on purpose: entries from different batch rows can carry the same
+  // feature index and would then update the same gradWeight column concurrently.
+  for (i = 0; i < nnz; i++) {
+    real val = scale * THNN_(get2d)(input, i, 2);
+
+    long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+    long h = (long)(THNN_(get2d)(input, i, 0)) - 1;
+    if (offset >= 0 && offset < inDim) {
+      THBlas_(axpy)(outDim,
+          val,
+          ROW_PTR2(gradOutput, h), gradOutput->stride[1],
+          COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+    } else {
+      THError(
+          "index out of bound. accGradParameters: %ld not between 1 and %ld",
+          offset + 1,
+          inDim);
+    }
+  }
+
+  // gradBias += gradOutput
+  THTensor* buf = THTensor_(new)();
+  THTensor_(sum)(buf, gradOutput, 0);
+  THTensor_(cadd)(gradBias, gradBias, scale, buf);
+  THTensor_(free)(buf);
+
+  if (weightDecay != 0) {
+    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+  }
+}
+
+void THNN_(SparseLinear_legacyAccGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          real weightDecay,
+          real scale)
+{
+  long h, i;
+  long outDim = THTensor_(size)(weight, 0);
+  long inDim = THTensor_(size)(weight, 1);
+
+  THArgCheck(THNN_(checkLegacyInput)(input), 2,
              "input size must be batchsize x nnz x 2");
   THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
              "gradWeight size wrong");
@@ -279,51 +390,7 @@ void THNN_(SparseLinear_zeroGradParameters)(
   }
 }
 
-void THNN_(SparseLinear_updateGradInput)(
-          THNNState *state,
-          THTensor *input,
-          THTensor *gradOutput,
-          THTensor *gradInput,
-          THTensor *weight)
-{
-  long h, i;
-  long outDim = weight->size[0];
-  long inDim = weight->size[1];
-
-  THArgCheck(THNN_(checkInput)(input), 2,
-             "input must be a batchSize x nnz x 2 tensor");
-  THArgCheck(THTensor_(isContiguous)(gradInput), 4,
-             "gradInput must be contiguous");
-  THArgCheck(THTensor_(isContiguous)(gradOutput), 3,
-             "gradOutput must be contiguous");
-
-  long batchSize = THTensor_(size)(input, 0);
-  long nnz = THTensor_(size)(input, 1);
-  THTensor_(resize2d)(gradOutput, batchSize, outDim);
-  THTensor_(resize3d)(gradInput, batchSize, nnz, 2);
-
-#pragma omp parallel for private(h, i) schedule(static) if (    \
-  batchSize > 1 && batchSize * nnz * outDim > 10000)
-  for (h = 0; h < batchSize; h++) {
-    for (i = 0; i < nnz; ++i) {
-      long offset = (long)(THTensor_(get3d)(input, h, i, 0)) - 1;
-      THTensor_(set3d)(gradInput, h, i, 0, offset + 1);
-
-      if (offset >= 0 && offset < inDim) {
-        real val = THBlas_(dot)(
-          outDim,
-          ROW_PTR2(gradOutput, h), gradOutput->stride[1],
-          COL_PTR2(weight, offset), weight->stride[0]);
-        THTensor_(set3d)(gradInput, h, i, 1, val);
-      } else {
-        THError(
-          "index out of bound. updateGradInput: %d not between 1 and %d",
-          offset + 1,
-          inDim);
-      }
-    }
-  }
-}
+void THNN_(SparseLinear_cudaClearState)(THNNState *state) {} // deliberately empty here; presumably only a CUDA backend has state to clear - TODO confirm
 
 #undef ROW_PTR2
 #undef COL_PTR2
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index 86c63da12..544d317dd 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -342,16 +342,24 @@ TH_API void THNN_(SparseLinear_updateOutput)(
           THTensor *input,
           THTensor *output,
           THTensor *weight,
-          THTensor *bias,
-          THTensor *cudaBuffer,
-          THTensor *shardBuffer);
-TH_API void THNN_(SparseLinear_updateGradInput)(
+          THTensor *bias);
+TH_API void THNN_(SparseLinear_accGradParameters)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
-          THTensor *gradInput,
-          THTensor *weight);
-TH_API void THNN_(SparseLinear_accGradParameters)(
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          real weightDecay,
+          real scale);
+TH_API void THNN_(SparseLinear_legacyUpdateOutput)(
+          THNNState *state,
+          THTensor *input, // the implementation requires batchsize x nnz x 2
+          THTensor *output, // the implementation requires this to be contiguous
+          THTensor *weight,
+          THTensor *bias); // the implementation requires size weight->size[0]
+TH_API void THNN_(SparseLinear_legacyAccGradParameters)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
@@ -374,6 +382,7 @@ TH_API void THNN_(SparseLinear_updateParameters)(
           THTensor *gradBias,
           THTensor *lastInput,
           real learningRate);
+TH_API void THNN_(SparseLinear_cudaClearState)(THNNState *state); // the generic implementation has an empty body
 
 TH_API void THNN_(Sqrt_updateOutput)(
           THNNState *state,
diff --git a/test.lua b/test.lua
index a5c287cfd..758806930 100644
--- a/test.lua
+++ b/test.lua
@@ -762,68 +762,71 @@ function nntest.Linear()
 end
 
 function nntest.SparseLinear()
+   local inb = math.random(5,10)
    local ini = math.random(50,100)
    local inj = math.random(5,10)
    local numNonzero = math.random(3,5)
 
-   local module = nn.SparseLinear(ini,inj)
+   local module = nn.SparseLinear(ini,inj, true)
+   local linear = nn.Linear(ini, inj)
+   linear.weight = module.weight:clone()
+   linear.bias = module.bias:clone()
+   module:zeroGradParameters()
+   linear:zeroGradParameters()
 
    -- Create a random sparse vector
-   local N = {}
-   for i = 1, ini do N[i] = i end
-   for i = 1, numNonzero do
-      local j = math.random(i,ini)
-      N[i], N[j] = N[j], N[i]
-   end
-   local input = torch.Tensor(numNonzero, 2):zero()
-   for i = 1, numNonzero do input[{i,1}] = N[i] end
-   local values = input:select(2,2)
-   values:copy(torch.rand(values:nElement())):mul(2):add(-1)
-
-   -- Check output
-   local actual = module:forward(input)
-   local expected = torch.Tensor(inj)
-   for j = 1, inj do
-      expected[j] = 0
-      for i = 1,numNonzero do
-         expected[j] = expected[j] + values[i] * module.weight[{j, N[i]}]
-      end
+   local input = {}
+   local nonsparse = torch.zeros(inb, ini)
+   for i=1,inb do
+       local nnz = math.random(1, 3)
+       local inds = torch.randperm(ini)[{{1,nnz}}]
+       input[i] = torch.Tensor(nnz, 2)
+       input[i]:select(2,1):copy(inds)
+       input[i]:select(2,2):copy(torch.ones(nnz))
+       nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2))
    end
+   local gradOutput = torch.rand(inb, inj)
+
+   -- Check output wrt linear, non-batch
+   local actual = module:forward(input[1])
+   local expected = linear:forward(nonsparse[1])
+   local actualgi = module:backward(input[1], gradOutput[1])
+   local expectedgi = linear:backward(nonsparse[1], gradOutput[1])
+   module:updateParameters(1)
+   linear:updateParameters(1)
+   local cmps = {'weight', 'bias', 'gradWeight', 'gradBias'} -- local: must not leak into the test file's globals
    local err = (expected - actual):abs():max()
+   local gierr = (expectedgi - actualgi[1]:select(2,2)):abs():max()
    mytester:assertle(err, precision, 'error on result')
-
-   -- Jacobian 1D
-   local err = sjac.testJacobian(module,input)
-   mytester:assertlt(err,precision, 'error on state ')
-
-   local err = sjac.testJacobianParameters(module, input, module.weight, module.gradWeight)
-   mytester:assertlt(err,precision, 'error on weight ')
-
-   local err = sjac.testJacobianParameters(module, input, module.bias, module.gradBias)
-   mytester:assertlt(err,precision, 'error on bias ')
-
-   local err = sjac.testJacobianUpdateParameters(module, input, module.weight)
-   mytester:assertlt(err,precision, 'error on weight [direct update] ')
-
-   local err = sjac.testJacobianUpdateParameters(module, input, module.bias)
-   mytester:assertlt(err,precision, 'error on bias [direct update] ')
-
-   for t,err in pairs(sjac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
-      mytester:assertlt(err, precision, string.format(
-                         'error on weight [%s]', t))
+   mytester:assertle(gierr, precision, 'error on gradInput')
+
+   for _,var in ipairs(cmps) do
+        local err = (module[var] - linear[var]):abs():max()
+        mytester:assertle(err, precision, 'error on '..var)
    end
 
-   for t,err in pairs(sjac.testAllUpdate(module, input, 'bias', 'gradBias')) do
-      mytester:assertlt(err, precision, string.format(
-                         'error on bias [%s]', t))
+   -- Check output wrt linear, batch
+   local actual = module:forward(input)
+   local expected = linear:forward(nonsparse)
+   local actualgi = module:backward(input, gradOutput)
+   local expectedgi = linear:backward(nonsparse, gradOutput)
+   module:updateParameters(1)
+   linear:updateParameters(1)
+   cmps = {'weight', 'bias', 'gradWeight', 'gradBias'}
+   local err = (expected - actual):abs():max()
+   local gicheck = torch.Tensor():resizeAs(expectedgi)
+   for i=1,#actualgi do gicheck[i]:copy(actualgi[i]:select(2,2)) end
+   local gierr = (expectedgi - gicheck):abs():max()
+   mytester:assertle(err, precision, 'error on result')
+   mytester:assertle(gierr, precision, 'error on gradInput')
+
+   for _,var in ipairs(cmps) do
+        local err = (module[var] - linear[var]):abs():max()
+        mytester:assertle(err, precision, 'error on '..var)
    end
 
-   local ferr, berr = sjac.testIO(module, input)
-   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
-   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
-
-   -- batch mode
-   local batch = math.random(1,5)
+   -- legacy batch mode
+   local batch = math.random(2,5)
 
    local input = torch.Tensor(batch, numNonzero, 2):zero()
    for k=1,batch do
@@ -837,6 +840,7 @@ function nntest.SparseLinear()
    end
    local values = input:select(3,2)
    values:copy(torch.rand(values:nElement())):mul(2):add(-1)
+
    -- Check output
    local actual = module:forward(input):clone()
    local expected = torch.Tensor(batch, inj)
@@ -845,16 +849,6 @@ function nntest.SparseLinear()
    end
    local err = (expected - actual):abs():max()
    mytester:assertle(err, precision, 'error on batch result forward')
-   local gradOutput = actual:clone():normal()
-   module:forward(input)
-   local actualG = module:backward(input, gradOutput):clone()
-   local expectedG = actualG:clone():zero()
-   for k = 1, batch do
-      module:forward(input[k])
-      expectedG[k]:copy(module:backward(input[k], gradOutput[k]))
-   end
-   err = (expectedG - actualG):abs():max()
-   mytester:assertle(err, precision, 'error on batch result forward')
 end
 
 function nntest.Bilinear()