SparseLinear: batched COO input, optional dense gradInput, legacy 3D path

SparseLinear.lua
  * forward accepts a table of nnz x 2 {index, value} tensors (one per
    sample) or a single nnz x 2 tensor; the rows are packed into one nnz x 3
    (sample, index, value) matrix before the C kernel is called.
  * any other tensor input (e.g. batch x nnz x 2) is routed to the
    SparseLinear_legacy* kernels.
  * updateGradInput is computed densely in Lua, and only when the module was
    constructed with doGradInput = true.
  * the sparse updateParameters / zeroGradParameters kernels are used only
    when exactly one legacy accGradParameters call has happened since the
    last update; otherwise the dense parent implementation runs.

lib/THNN/generic/SparseLinear.c, lib/THNN/generic/THNN.h
  * new COO SparseLinear_updateOutput / SparseLinear_accGradParameters; the
    previous implementations are kept as SparseLinear_legacy*.
  * SparseLinear_updateGradInput is removed; SparseLinear_cudaClearState is
    added with an empty body.
  * the COO loops validate both the row and the feature index, and run
    serially because distinct non-zeros can accumulate into the same output
    row or gradWeight column.

test.lua
  * nntest.SparseLinear compares the module against nn.Linear on the
    equivalent dense input (non-batch and batch), then checks the legacy
    batch forward pass.

diff --git a/SparseLinear.lua b/SparseLinear.lua
index 77ef7c3db..6185df991 100644
--- a/SparseLinear.lua
+++ b/SparseLinear.lua
@@ -1,19 +1,30 @@
 local THNN = require 'nn.THNN'
 local SparseLinear, parent = torch.class('nn.SparseLinear', 'nn.Module')
 
-function SparseLinear:__init(inputSize, outputSize)
+-- Values of self.sparseUpdate: how many legacy accGradParameters calls have
+-- run since the parameters were last updated or the gradients last zeroed.
+local NO_LAST_INPUT = 0
+local ONE_LAST_INPUT = 1
+local ACC_MULTIPLE_TIMES = 2
+
+-- inputSize, outputSize: dimensions of the dense weight matrix.
+-- doGradInput (optional boolean, default false): when true, updateGradInput
+-- computes a dense gradient w.r.t. the input.
+function SparseLinear:__init(inputSize, outputSize, doGradInput)
    parent.__init(self)
 
    self.weightDecay = 0
+   self.doGradInput = doGradInput or false
    self.weight = torch.Tensor(outputSize, inputSize):zero()
    self.bias = torch.Tensor(outputSize):zero()
    self.gradWeight = torch.Tensor(outputSize, inputSize):zero()
    self.gradBias = torch.Tensor(outputSize):zero()
-   self.lastInput = nil
 
-   if torch.getnumthreads() > 1 and outputSize >= 128 then
-     self.shardBuffer = torch.Tensor(outputSize, torch.getnumthreads())
-   end
+   assert(type(self.doGradInput) == type(true))
+
+   self.lastInput = nil
+   self.sparseUpdate = NO_LAST_INPUT
+   self.formatted_input = nil
 
    -- state
    self.gradInput:resize(inputSize)
@@ -33,78 +44,158 @@ function SparseLinear:reset(stdv)
 end
 
+-- Normalises `input` and returns (input, batchMode, legacyMode):
+--   table of nnz x 2 tensors -> input,   true,  false
+--   one nnz x 2 tensor       -> {input}, false, false
+--   any other tensor         -> input,   true,  true   (legacy kernels)
 function SparseLinear:reshapeInput(input)
-   if input:dim() == 2 then
-      return input:view(1, input:size(1), input:size(2)), false
+   if type(input) == 'table' then
+      return input, true, false
    else
-      return input, true
+      if input:dim() == 2 then
+         return {input}, false, false
+      else
+         return input, true, true
+      end
    end
 end
 
 function SparseLinear:updateOutput(input)
-   self.cudaBuffer = self.cudaBuffer or input.new()
-   local input, batchMode = self:reshapeInput(input)
-
-   input.THNN.SparseLinear_updateOutput(
-      input:cdata(),
-      self.output:cdata(),
-      self.weight:cdata(),
-      self.bias:cdata(),
-      self.cudaBuffer:cdata(),
-      THNN.optionalTensor(self.shardBuffer)
-   )
-
-   -- fix output size for batchSize = 1
-   if not batchMode then
-      self.output:set(self.output:view(self.output:size(2)))
-   end
+   local input, batchMode, legacyMode = self:reshapeInput(input)
+   self.legacyMode = legacyMode
 
-   return self.output
-end
+   if legacyMode then
+      input.THNN.SparseLinear_legacyUpdateOutput(
+         input:cdata(),
+         self.output:cdata(),
+         self.weight:cdata(),
+         self.bias:cdata()
+      )
+   else
+      local nbatches = #input
+      if nbatches == 0 then
+         self.output:resizeAs(self.bias):copy(self.bias)
+         return self.output
+      end
 
-function SparseLinear:accGradParameters(input, gradOutput, scale)
-   local input, batchMode = self:reshapeInput(input)
+      -- Pack every sample's (index, value) rows into one nnz x 3 matrix
+      -- whose columns are (sample, index, value), as the kernel expects.
+      local size = 0
+      local marker = 1
+      self.formatted_input = self.formatted_input or input[1].new()
+
+      for i,v in ipairs(input) do size = size + input[i]:size(1) end
+      self.formatted_input:resize(size, 3)
+      for i,v in ipairs(input) do
+         local buf = self.formatted_input:narrow(1, marker, input[i]:size(1))
+         buf:narrow(2,2,2):copy(input[i])
+         buf:select(2,1):fill(i)
+         marker = marker + input[i]:size(1)
+      end
 
-   self.lastInput = self.lastInput or input.new()
-   self.lastInput:resizeAs(input):copy(input)
-   if not batchMode then
-      gradOutput = gradOutput:view(1, gradOutput:size(1))
+      self.output:resize(nbatches, self.weight:size(1))
+      input[1].THNN.SparseLinear_updateOutput(
+         self.formatted_input:cdata(),
+         self.output:cdata(),
+         self.weight:cdata(),
+         self.bias:cdata()
+      )
+
+      -- fix output size for batchSize = 1
+      if not batchMode then
+         self.output = self.output[1]
+      end
    end
 
-   input.THNN.SparseLinear_accGradParameters(
-      input:cdata(),
-      gradOutput:cdata(),
-      self.gradWeight:cdata(),
-      self.gradBias:cdata(),
-      self.weight:cdata(),
-      self.bias:cdata(),
-      self.weightDecay or 0,
-      scale or 1
-   )
+   return self.output
 end
 
-function SparseLinear:updateGradInput(input, gradOutput)
-   if self.gradInput then
-      local input, batchMode = self:reshapeInput(input)
-      if not batchMode then
-         gradOutput = gradOutput:view(1, gradOutput:size(1))
+function SparseLinear:accGradParameters(input, gradOutput, scale)
+   local input, batchMode, legacyMode = self:reshapeInput(input)
+   self.legacyMode = legacyMode
+
+   if legacyMode then
+      self.lastInput = self.lastInput or input.new()
+      if self.sparseUpdate == NO_LAST_INPUT then
+         self.lastInput:resizeAs(input):copy(input)
+         self.sparseUpdate = ONE_LAST_INPUT
+      elseif self.sparseUpdate == ONE_LAST_INPUT then
+         self.sparseUpdate = ACC_MULTIPLE_TIMES
       end
-      input.THNN.SparseLinear_updateGradInput(
+
+      input.THNN.SparseLinear_legacyAccGradParameters(
          input:cdata(),
          gradOutput:cdata(),
-         self.gradInput:cdata(),
-         self.weight:cdata()
+         self.gradWeight:cdata(),
+         self.gradBias:cdata(),
+         self.weight:cdata(),
+         self.bias:cdata(),
+         self.weightDecay or 0,
+         scale or 1
       )
-      -- fix gradInput size for batchSize = 1
+   else
       if not batchMode then
-         self.gradInput:set(self.gradInput:view(self.gradInput:size(2), self.gradInput:size(3)))
+         -- view, not resize: the caller's gradOutput must keep its shape
+         gradOutput = gradOutput:view(1, gradOutput:size(1))
       end
 
-      return self.gradInput
+      input[1].THNN.SparseLinear_accGradParameters(
+         self.formatted_input:cdata(),
+         gradOutput:cdata(),
+         self.gradWeight:cdata(),
+         self.gradBias:cdata(),
+         self.weight:cdata(),
+         self.bias:cdata(),
+         self.weightDecay or 0,
+         scale or 1
+      )
    end
 end
 
+-- Returns a dense gradient w.r.t. the input when doGradInput is set: a
+-- batch x inputSize x 2 tensor in legacy mode, otherwise a table with one
+-- inputSize x 2 tensor per sample. Column 1 holds indices, column 2 values.
+function SparseLinear:updateGradInput(input, gradOutput)
+   if self.legacyMode then
+      if type(self.gradInput) ~= type(gradOutput) then self.gradInput = gradOutput.new() end
+      self.gradInput:resizeAs(input)
+   else
+      self.gradInput = {}
+   end
+   if self.doGradInput then
+      -- GradInput should be dense anyway
+      local gi
+      local batchMode = true
+      if gradOutput:dim() == 1 then
+         gi = self.weight:t()*gradOutput
+         batchMode = false
+      elseif gradOutput:dim() == 2 then
+         gi = gradOutput*self.weight
+      end
+      local ini = self.weight:size(2)
+
+      if self.legacyMode then
+         local batches = self.gradInput:size(1)
+         self.gradInput:resize(batches, ini, 2)
+         self.gradInput:select(3,1):copy(torch.repeatTensor(torch.range(1, ini), batches, 1))
+         self.gradInput:select(3,2):copy(gi)
+      else
+         -- one dense ini x 2 tensor per sample: column 1 index, column 2 gradient
+         if not batchMode then gi:resize(1, ini) end
+         for i = 1,gi:size(1) do
+            self.gradInput[i] = gradOutput.new(ini, 2)
+            self.gradInput[i]:select(2, 2):copy(gi[i])
+            self.gradInput[i]:select(2, 1):range(1, ini)
+         end
+      end
+   end
+   return self.gradInput
+end
+
+-- These functions do sparse updates / zeros. However, if we accumulated
+-- gradients multiple times, we can't depend on the last input to do sparse
+-- updates.
 function SparseLinear:updateParameters(learningRate)
-   if self.lastInput then
+   if self.lastInput and self.legacyMode and self.sparseUpdate == ONE_LAST_INPUT then
       self.lastInput.THNN.SparseLinear_updateParameters(
          self.weight:cdata(),
          self.bias:cdata(),
@@ -116,22 +207,25 @@ function SparseLinear:updateParameters(learningRate)
    else
       parent.updateParameters(self, learningRate)
    end
+   self.sparseUpdate = NO_LAST_INPUT
 end
 
 function SparseLinear:zeroGradParameters()
-   if self.lastInput then
+   if self.lastInput and self.legacyMode and self.sparseUpdate == ONE_LAST_INPUT then
       self.lastInput.THNN.SparseLinear_zeroGradParameters(
-        self.gradWeight:cdata(),
-        self.gradBias:cdata(),
-        self.lastInput:cdata()
+         self.gradWeight:cdata(),
+         self.gradBias:cdata(),
+         self.lastInput:cdata()
       )
    else
       parent.zeroGradParameters(self)
    end
+   self.sparseUpdate = NO_LAST_INPUT
 end
 
 function SparseLinear:clearState()
    if self.lastInput then self.lastInput:set() end
-   if self.cudaBuffer then self.cudaBuffer:set() end
+   -- No input is available here, so dispatch through a tensor the module owns.
+   self.weight.THNN.SparseLinear_cudaClearState()
    return parent.clearState(self)
 end
diff --git a/lib/THNN/generic/SparseLinear.c b/lib/THNN/generic/SparseLinear.c
index a84e03039..2e24d9192 100644
--- a/lib/THNN/generic/SparseLinear.c
+++ b/lib/THNN/generic/SparseLinear.c
@@ -5,15 +5,22 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
+#include <stdio.h>
 
 #define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
 #define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
 
-static bool THNN_(checkInput)(THTensor* t)
+static bool THNN_(checkLegacyInput)(THTensor* t)
 {
   return t->nDimension == 3 && t->size[2] == 2;
 }
 
+// COO input: one row per non-zero, columns are (row, feature index, value).
+static bool THNN_(checkInput)(THTensor* t)
+{
+  return t->nDimension == 2 && t->size[1] == 3;
+}
+
 static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
 {
   return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
@@ -41,15 +48,66 @@ void THNN_(SparseLinear_updateOutput)(
           THTensor *input,
           THTensor *output,
           THTensor *weight,
-          THTensor *bias,
-          THTensor *cudaBuffer,
-          THTensor *shardBuffer)
+          THTensor *bias)
+{
+  long h, i;
+  long outDim = THTensor_(size)(weight, 0);
+  long inDim = THTensor_(size)(weight, 1);
+  long batchSize = THTensor_(size)(output, 0);
+
+  THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
+  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+  long nnz = THTensor_(size)(input, 0);
+
+  // output = weight * input + bias
+  THTensor_(zero)(output);
+  // Serial on purpose: several non-zeros can accumulate into the same output row.
+  for (i = 0; i < nnz; i++) {
+    real val = THNN_(get2d)(input, i, 2);
+    if (val == 0) {
+      continue;
+    }
+
+    long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+    long row = (long)(THNN_(get2d)(input, i, 0)) - 1;
+    if (row < 0 || row >= batchSize) {
+      THError("index out of bound. updateOutput: row %ld not between 1 and %ld",
+          row + 1, batchSize);
+    }
+    if (offset >= 0 && offset < inDim) {
+      THBlas_(axpy)(outDim,
+          val,
+          COL_PTR2(weight, offset), weight->stride[0],
+          ROW_PTR2(output, row), output->stride[1]);
+    } else {
+      THError("index out of bound. updateOutput: %ld not between 1 and %ld",
+          offset + 1, inDim);
+    }
+  }
+
+  // add the bias to every output row
+  THTensor* output_row = THTensor_(new)();
+  for (h = 0; h < batchSize; h++) {
+    THTensor_(select)(output_row, output, 0, h);
+    THTensor_(cadd)(output_row, bias, 1.0, output_row);
+  }
+  THTensor_(free)(output_row);
+}
+
+void THNN_(SparseLinear_legacyUpdateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias)
 {
   long h, i;
   long outDim = THTensor_(size)(weight, 0);
   long inDim = THTensor_(size)(weight, 1);
 
-  THArgCheck(THNN_(checkInput)(input), 2, "input size must be batchsize x nnz x 2");
+  THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
   THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
   THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
 
@@ -105,6 +163,69 @@ void THNN_(SparseLinear_accGradParameters)(
   long inDim = THTensor_(size)(weight, 1);
 
   THArgCheck(THNN_(checkInput)(input), 2,
+             "input must be in coo format, nnz x 3");
+  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+             "gradWeight size wrong");
+  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+             "gradBias size wrong");
+  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+             "gradOutput must be contiguous");
+
+  long nnz = THTensor_(size)(input, 0);
+  long batchSize = THTensor_(size)(gradOutput, 0);
+
+  // gradWeight += gradOutput * input
+  // Serial on purpose: non-zeros from different rows that share a feature
+  // index would otherwise race on the same gradWeight column.
+  for (i = 0; i < nnz; i++) {
+    real val = scale * THNN_(get2d)(input, i, 2);
+
+    long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+    long row = (long)(THNN_(get2d)(input, i, 0)) - 1;
+    if (row < 0 || row >= batchSize) {
+      THError("index out of bound. accGradParameters: row %ld not between 1 and %ld",
+          row + 1, batchSize);
+    }
+    if (offset >= 0 && offset < inDim) {
+      THBlas_(axpy)(outDim,
+          val,
+          ROW_PTR2(gradOutput, row), gradOutput->stride[1],
+          COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+    } else {
+      THError(
+          "index out of bound. accGradParameters: %ld not between 1 and %ld",
+          offset + 1,
+          inDim);
+    }
+  }
+
+  // gradBias += gradOutput
+  THTensor* buf = THTensor_(new)();
+  THTensor_(sum)(buf, gradOutput, 0);
+  THTensor_(cadd)(gradBias, gradBias, scale, buf);
+  THTensor_(free)(buf);
+
+  if (weightDecay != 0) {
+    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+  }
+}
+
+void THNN_(SparseLinear_legacyAccGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          real weightDecay,
+          real scale)
+{
+  long h, i;
+  long outDim = THTensor_(size)(weight, 0);
+  long inDim = THTensor_(size)(weight, 1);
+
+  THArgCheck(THNN_(checkLegacyInput)(input), 2,
              "input size must be batchsize x nnz x 2");
   THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
              "gradWeight size wrong");
@@ -279,51 +400,8 @@ void THNN_(SparseLinear_zeroGradParameters)(
   }
 }
 
-void THNN_(SparseLinear_updateGradInput)(
-          THNNState *state,
-          THTensor *input,
-          THTensor *gradOutput,
-          THTensor *gradInput,
-          THTensor *weight)
-{
-  long h, i;
-  long outDim = weight->size[0];
-  long inDim = weight->size[1];
-
-  THArgCheck(THNN_(checkInput)(input), 2,
-             "input must be a batchSize x nnz x 2 tensor");
-  THArgCheck(THTensor_(isContiguous)(gradInput), 4,
-             "gradInput must be contiguous");
-  THArgCheck(THTensor_(isContiguous)(gradOutput), 3,
-             "gradOutput must be contiguous");
-
-  long batchSize = THTensor_(size)(input, 0);
-  long nnz = THTensor_(size)(input, 1);
-  THTensor_(resize2d)(gradOutput, batchSize, outDim);
-  THTensor_(resize3d)(gradInput, batchSize, nnz, 2);
-
-#pragma omp parallel for private(h, i) schedule(static) if ( \
-  batchSize > 1 && batchSize * nnz * outDim > 10000)
-  for (h = 0; h < batchSize; h++) {
-    for (i = 0; i < nnz; ++i) {
-      long offset = (long)(THTensor_(get3d)(input, h, i, 0)) - 1;
-      THTensor_(set3d)(gradInput, h, i, 0, offset + 1);
-
-      if (offset >= 0 && offset < inDim) {
-        real val = THBlas_(dot)(
-            outDim,
-            ROW_PTR2(gradOutput, h), gradOutput->stride[1],
-            COL_PTR2(weight, offset), weight->stride[0]);
-        THTensor_(set3d)(gradInput, h, i, 1, val);
-      } else {
-        THError(
-            "index out of bound. updateGradInput: %d not between 1 and %d",
-            offset + 1,
-            inDim);
-      }
-    }
-  }
-}
+// Intentionally empty here; presumably a hook for backends that keep state.
+void THNN_(SparseLinear_cudaClearState)(THNNState *state) {}
 
 #undef ROW_PTR2
 #undef COL_PTR2
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index 86c63da12..544d317dd 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -342,16 +342,24 @@ TH_API void THNN_(SparseLinear_updateOutput)(
           THTensor *input,
           THTensor *output,
           THTensor *weight,
-          THTensor *bias,
-          THTensor *cudaBuffer,
-          THTensor *shardBuffer);
-TH_API void THNN_(SparseLinear_updateGradInput)(
+          THTensor *bias);
+TH_API void THNN_(SparseLinear_accGradParameters)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
-          THTensor *gradInput,
-          THTensor *weight);
-TH_API void THNN_(SparseLinear_accGradParameters)(
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          real weightDecay,
+          real scale);
+TH_API void THNN_(SparseLinear_legacyUpdateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias);
+TH_API void THNN_(SparseLinear_legacyAccGradParameters)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
@@ -374,6 +382,7 @@ TH_API void THNN_(SparseLinear_updateParameters)(
           THTensor *gradBias,
           THTensor *lastInput,
           real learningRate);
+TH_API void THNN_(SparseLinear_cudaClearState)(THNNState *state);
 
 TH_API void THNN_(Sqrt_updateOutput)(
           THNNState *state,
diff --git a/test.lua b/test.lua
index a5c287cfd..758806930 100644
--- a/test.lua
+++ b/test.lua
@@ -762,68 +762,71 @@ function nntest.Linear()
 end
 
 function nntest.SparseLinear()
+   local inb = math.random(5,10)
    local ini = math.random(50,100)
    local inj = math.random(5,10)
    local numNonzero = math.random(3,5)
 
-   local module = nn.SparseLinear(ini,inj)
+   local module = nn.SparseLinear(ini,inj, true)
+   local linear = nn.Linear(ini, inj)
+   linear.weight = module.weight:clone()
+   linear.bias = module.bias:clone()
+   module:zeroGradParameters()
+   linear:zeroGradParameters()
 
    -- Create a random sparse vector
-   local N = {}
-   for i = 1, ini do N[i] = i end
-   for i = 1, numNonzero do
-      local j = math.random(i,ini)
-      N[i], N[j] = N[j], N[i]
-   end
-   local input = torch.Tensor(numNonzero, 2):zero()
-   for i = 1, numNonzero do input[{i,1}] = N[i] end
-   local values = input:select(2,2)
-   values:copy(torch.rand(values:nElement())):mul(2):add(-1)
-
-   -- Check output
-   local actual = module:forward(input)
-   local expected = torch.Tensor(inj)
-   for j = 1, inj do
-      expected[j] = 0
-      for i = 1,numNonzero do
-         expected[j] = expected[j] + values[i] * module.weight[{j, N[i]}]
-      end
+   local input = {}
+   local nonsparse = torch.zeros(inb, ini)
+   for i=1,inb do
+      local nnz = math.random(1, 3)
+      local inds = torch.randperm(ini)[{{1,nnz}}]
+      input[i] = torch.Tensor(nnz, 2)
+      input[i]:select(2,1):copy(inds)
+      input[i]:select(2,2):copy(torch.ones(nnz))
+      nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2))
    end
+   local gradOutput = torch.rand(inb, inj)
+
+   -- Check output wrt linear, non-batch
+   local actual = module:forward(input[1])
+   local expected = linear:forward(nonsparse[1])
+   local actualgi = module:backward(input[1], gradOutput[1])
+   local expectedgi = linear:backward(nonsparse[1], gradOutput[1])
+   module:updateParameters(1)
+   linear:updateParameters(1)
+   local cmps = {'weight', 'bias', 'gradWeight', 'gradBias'}
    local err = (expected - actual):abs():max()
+   local gierr = (expectedgi - actualgi[1]:select(2,2)):abs():max()
    mytester:assertle(err, precision, 'error on result')
-
-   -- Jacobian 1D
-   local err = sjac.testJacobian(module,input)
-   mytester:assertlt(err,precision, 'error on state ')
-
-   local err = sjac.testJacobianParameters(module, input, module.weight, module.gradWeight)
-   mytester:assertlt(err,precision, 'error on weight ')
-
-   local err = sjac.testJacobianParameters(module, input, module.bias, module.gradBias)
-   mytester:assertlt(err,precision, 'error on bias ')
-
-   local err = sjac.testJacobianUpdateParameters(module, input, module.weight)
-   mytester:assertlt(err,precision, 'error on weight [direct update] ')
-
-   local err = sjac.testJacobianUpdateParameters(module, input, module.bias)
-   mytester:assertlt(err,precision, 'error on bias [direct update] ')
-
-   for t,err in pairs(sjac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
-      mytester:assertlt(err, precision, string.format(
-                           'error on weight [%s]', t))
+   mytester:assertle(gierr, precision, 'error on gradInput')
+
+   for _,var in ipairs(cmps) do
+      local err = (module[var] - linear[var]):abs():max()
+      mytester:assertle(err, precision, 'error on '..var)
    end
 
-   for t,err in pairs(sjac.testAllUpdate(module, input, 'bias', 'gradBias')) do
-      mytester:assertlt(err, precision, string.format(
-                           'error on bias [%s]', t))
+   -- Check output wrt linear, batch
+   local actual = module:forward(input)
+   local expected = linear:forward(nonsparse)
+   local actualgi = module:backward(input, gradOutput)
+   local expectedgi = linear:backward(nonsparse, gradOutput)
+   module:updateParameters(1)
+   linear:updateParameters(1)
+   cmps = {'weight', 'bias', 'gradWeight', 'gradBias'}
+   local err = (expected - actual):abs():max()
+   local gicheck = torch.Tensor():resizeAs(expectedgi)
+   for i=1,#actualgi do gicheck[i]:copy(actualgi[i]:select(2,2)) end
+   local gierr = (expectedgi - gicheck):abs():max()
+   mytester:assertle(err, precision, 'error on result')
+   mytester:assertle(gierr, precision, 'error on gradInput')
+
+   for _,var in ipairs(cmps) do
+      local err = (module[var] - linear[var]):abs():max()
+      mytester:assertle(err, precision, 'error on '..var)
    end
 
-   local ferr, berr = sjac.testIO(module, input)
-   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
-   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
-
-   -- batch mode
-   local batch = math.random(1,5)
+   -- legacy batch mode
+   local batch = math.random(2,5)
 
    local input = torch.Tensor(batch, numNonzero, 2):zero()
    for k=1,batch do
@@ -837,6 +840,7 @@ function nntest.SparseLinear()
    end
    local values = input:select(3,2)
    values:copy(torch.rand(values:nElement())):mul(2):add(-1)
+
    -- Check output
    local actual = module:forward(input):clone()
    local expected = torch.Tensor(batch, inj)
@@ -845,16 +849,6 @@ function nntest.SparseLinear()
    end
    local err = (expected - actual):abs():max()
    mytester:assertle(err, precision, 'error on batch result forward')
-   local gradOutput = actual:clone():normal()
-   module:forward(input)
-   local actualG = module:backward(input, gradOutput):clone()
-   local expectedG = actualG:clone():zero()
-   for k = 1, batch do
-      module:forward(input[k])
-      expectedG[k]:copy(module:backward(input[k], gradOutput[k]))
-   end
-   err = (expectedG - actualG):abs():max()
-   mytester:assertle(err, precision, 'error on batch result forward')
 end
 
 function nntest.Bilinear()