-- L1CriterionModule.lua
-- Wraps a criterion, for which only a single input comes from the network, into a module. The criterion can also take a fixed target, specified via setTarget(). DEBUG_MODE, presently disabled, potentially outputs diagnostic messages during each operation.
local L1CriterionModule, parent = torch.class('nn.L1CriterionModule', 'nn.Module')

function L1CriterionModule:__init(criterion, initial_lambda, desired_criterion_value, learning_rate_scaling_factor, exempt_max)
   self.criterion = criterion
   self.criterion_output = 0
   --self.gradInput = criterion.gradInput
   self.gradInput = torch.Tensor()

   -- only expose the scaling factor lambda as a weight if we want it to be trained; otherwise, it is subject to manipulations like weight decay by optim modules (like sgd) that violate abstraction barriers
   if desired_criterion_value then
      self.weight = torch.Tensor(1)
      self.gradWeight = torch.Tensor(1)
      self.weight[1] = initial_lambda or 1
   end

   self.lambda = initial_lambda or 1
   self.lambda_scaling = 1 -- TEST CODE FOR AUTO-CATEGORICAL UNITS
   self.desired_criterion_value = desired_criterion_value
   self.learning_rate_scaling_factor = learning_rate_scaling_factor or 1
   self.exempt_max = exempt_max
end
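
--[[ Construction sketch (illustration only, not part of the original module); assumes
     the torch 'nn' package is loaded and this file has been required:

     -- plain weighted L1 penalty with a fixed lambda of 0.5
     local l1 = nn.L1CriterionModule(nn.L1Cost(), 0.5)

     -- Lagrange-multiplier variant: passing desired_criterion_value exposes lambda as a
     -- trainable weight, to be adjusted until the criterion output approaches 10
     local constrained = nn.L1CriterionModule(nn.L1Cost(), 1, 10, 0.1)
--]]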

-- this is required for ClassNLLCriterion
function L1CriterionModule:setTarget(target) -- the target need not be set if the criterion's updateOutput and updateGradInput only take the single argument input; target is then nil by default, and ignored
   self.target = target
end
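
--[[ Usage sketch with a target-taking criterion (hypothetical values); the target must be
     refreshed whenever the minibatch changes:

     local nll = nn.L1CriterionModule(nn.ClassNLLCriterion(), 1)
     nll:setTarget(2) -- class label for the current example
     local loss = nll:updateOutput(nn.LogSoftMax():forward(torch.randn(5)))
--]]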

function L1CriterionModule:reset_lambda(new_lambda)
   self.lambda = new_lambda
   if self.weight then
      self.weight[1] = self.lambda
   end
end

function L1CriterionModule:updateOutput(input)
   if self.weight then
      self.lambda = self.weight[1]
   end

   -- find max; use L1Cost.c; add back in max value and save index for gradient calculation
   self.criterion_output = self.criterion:updateOutput(input, self.target) -- self.target is ignored by L1Cost, but used by ClassNLLCriterion

   -- use an L1 norm reweighted in proportion to the magnitude of each unit, so that units near 0 are subject to a full L1 norm, whereas the most active units are subject to a greatly reduced L1 norm: \sum_i (1 - |z_i|/(\sum_j |z_j|)) * |z_i| = \sum_i |z_i| - (\sum_i |z_i|^2)/(\sum_j |z_j|)
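   -- worked example (illustration only): for z = (3, 1), \sum_j |z_j| = 4, so the reweighted penalty is (1 - 3/4)*3 + (1 - 1/4)*1 = 1.5; the identity gives the same value: 4 - (9 + 1)/4 = 1.5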
   if self.exempt_max and (self.exempt_max < 1) then
      self.L1_norm_vec = self.L1_norm_vec or torch.Tensor()
      self.L2_norm_vec = self.L2_norm_vec or torch.Tensor()
      self.abs_input = self.abs_input or torch.Tensor()
      self.abs_input_squared = self.abs_input_squared or torch.Tensor()
      --[[ RESIZING IS DONE AUTOMATICALLY!!!
      local norm_vec_size = input:size().new(input:size():size() - 1) -- create a new Storage with dimensionality one less than the Storage holding the size of input
      for i = 1,input:size():size() - 1 do -- create a size Storage that matches the size Storage of input up to the last dimension
         norm_vec_size[i] = input:size()[i]
      end
      self.L1_norm_vec:resize(norm_vec_size)
      self.L2_norm_vec:resize(norm_vec_size)
      self.output_vector_size = input:size(input:dim()) -- number of elements in the last dimension of the input, which indexes over hidden units rather than batches
      --]]
      self.abs_input:resizeAs(input):copy(input):abs()
      self.abs_input_squared:resizeAs(input):copy(self.abs_input):pow(2)
      torch.sum(self.L1_norm_vec, self.abs_input, input:dim()) -- resizes automatically; keep in mind that the dimension over which the sum is performed is still present, but has an extent of 1
      torch.sum(self.L2_norm_vec, self.abs_input_squared, input:dim())
      if input:dim() == 2 then
         self.L1_norm_vec = self.L1_norm_vec:select(input:dim(), 1) -- eliminate the vestigial dimension
         self.L2_norm_vec = self.L2_norm_vec:select(input:dim(), 1)
      end
      self.L2_norm_vec:cdiv(self.L1_norm_vec) -- \sum_i |z_i|^2 / (\sum_j |z_j|)
      if math.abs(self.criterion_output - torch.sum(self.L1_norm_vec)) > 1e-5 then
         error('WARNING!!! ' .. self.criterion_output .. ' ~= ' .. torch.sum(self.L1_norm_vec))
      end
      self.criterion_output = torch.sum(self.L1_norm_vec) - torch.sum(self.L2_norm_vec) -- a scalar is returned when torch.sum operates over all dimensions
      -- prepare for updateGradInput
      self.L2_norm_vec:cdiv(self.L1_norm_vec) -- compute (\sum_i |z_i|^2) / (\sum_i |z_i|)^2, since we've already put (\sum_i |z_i|^2) / (\sum_i |z_i|) in L2_norm_vec
      --[[
      local max_vals, max_indices = input:max(input:dim())
      local min_vals, min_indices = input:min(input:dim())
      if max_vals:dim() == 2 then
         max_vals = max_vals:select(2, 1)
         min_vals = min_vals:select(2, 1)
      end
      self.extreme_elements = self.extreme_elements or torch.Tensor()
      self.extreme_elements:resize(max_vals:size(1), 2)
      self.extreme_elements:select(2, 1):copy(max_vals)
      self.extreme_elements:select(2, 2):copy(min_vals)
      local abs_max_vals, abs_max_signs = self.extreme_elements:abs():max(2)
      self.criterion_output = self.criterion_output - abs_max_vals:sum()
      --]]
   -- use a^n * \sum_i (1/a - |z_i|/(\sum_j |z_j|))^n * |z_i| = \sum_i (1 - a * |z_i| / (\sum_j |z_j|))^n * |z_i|; n = exempt_max; a = internal_factor
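   -- the identity follows by pulling a inside the power: a^n * (1/a - x)^n = (a * (1/a - x))^n = (1 - a*x)^n, with x = |z_i| / (\sum_j |z_j|); the a^n prefactor is folded into lambda_scaling below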
   elseif self.exempt_max and (self.exempt_max >= 1) then
      self.L1_norm_vec = self.L1_norm_vec or torch.Tensor() -- \sum_i |z_i| for each batch
      self.L2_norm_vec = self.L2_norm_vec or torch.Tensor() -- \sum_i |z_i|^2 for each batch
      self.abs_input = self.abs_input or torch.Tensor()
      self.abs_input_squared = self.abs_input_squared or torch.Tensor()

      self.abs_input:resizeAs(input):copy(input):abs() -- |z_i|
      self.abs_input_squared:resizeAs(input):copy(self.abs_input):pow(2) -- |z_i|^2

      -- L1_norm_vec: \sum_i |z_i| for each element of the batch
      torch.sum(self.L1_norm_vec, self.abs_input, input:dim()) -- resizes automatically; keep in mind that the dimension over which the sum is performed is still present, but has an extent of 1
      if input:dim() == 2 then
         self.L1_norm_vec = self.L1_norm_vec:select(input:dim(), 1) -- eliminate the vestigial dimension
      end

      -- scaling term: vector of (1/a - |z_i| / (\sum_j |z_j|))
      local internal_factor = 4
      self.lambda_scaling = internal_factor^self.exempt_max
      local scaling_term = torch.cmul(self.abs_input, torch.ger(torch.pow(self.L1_norm_vec, -1), torch.ones(input:size(input:dim())))):mul(-1):add(1/internal_factor) -- (1/a - |z_i| / (\sum_j |z_j|))
      local thresh = -math.pow(0.4, 1/self.exempt_max) / internal_factor -- equals -0.1 when exempt_max == 1
      scaling_term:maxN(thresh) -- maxN and zeroLtN2 below are custom tensor operations defined elsewhere in this repository, not part of stock torch
      self.scaling_term_pow_n = torch.pow(scaling_term, self.exempt_max) -- (1/a - |z_i| / (\sum_j |z_j|))^n
      self.scaling_term_pow_n_minus_1 = torch.pow(scaling_term, self.exempt_max - 1) -- (1/a - |z_i| / (\sum_j |z_j|))^(n-1)
      self.scaling_term_pow_n_minus_1:zeroLtN2(scaling_term, thresh)

      self.criterion_output = torch.sum(torch.cmul(self.scaling_term_pow_n, self.abs_input)) -- sum over both i for each batch, and over batches

      -- L2_norm_vec: \sum_i (1/a - |z_i| / (\sum_j |z_j|))^(n-1) * |z_i|^2
      torch.sum(self.L2_norm_vec, torch.cmul(self.abs_input_squared, self.scaling_term_pow_n_minus_1), input:dim())
      if input:dim() == 2 then
         self.L2_norm_vec = self.L2_norm_vec:select(input:dim(), 1)
      end
      -- L2_norm_vec: \sum_i n * (1/a - |z_i| / (\sum_j |z_j|))^(n-1) * |z_i|^2 / (\sum_j |z_j|)^2
      self.L2_norm_vec:cdiv(torch.pow(self.L1_norm_vec, 2)):mul(self.exempt_max)
   end

   self.output = self.lambda_scaling * self.lambda * self.criterion_output
   return self.output
end
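
--[[ Forward-pass sketch (illustration only): with exempt_max unset, the module reduces to
     lambda times the wrapped criterion's output:

     local m = nn.L1CriterionModule(nn.L1Cost(), 0.5)
     local input = torch.randn(2, 4) -- minibatch of 2 examples, 4 hidden units
     print(m:updateOutput(input)) -- 0.5 * input:norm(1)
--]]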

function L1CriterionModule:updateGradInput(input) -- we leave out the standard gradOutput argument here to make it clear that L1CriterionModule's gradInput is sui generis
   if self.weight then
      self.lambda = self.weight[1]
   end

   local criterion_grad_input = self.criterion:updateGradInput(input, self.target)
   self.gradInput:resizeAs(criterion_grad_input)
   self.gradInput:copy(criterion_grad_input)

   -- need to construct the vector (1 - |z_i| / \sum_j |z_j|)
   if self.exempt_max and (self.exempt_max < 1) then -- don't apply the full L1 loss to the largest elements
      self.second_term = torch.ger(torch.pow(self.L1_norm_vec, -1), torch.ones(input:size(input:dim())))
      self.third_term = torch.ger(self.L2_norm_vec, torch.ones(input:size(input:dim()))) -- L2_norm_vec is already (\sum_i |z_i|^2) / (\sum_i |z_i|)^2
      self.gradInput:cmul(torch.add(torch.ones(input:size()), self.third_term)) -- sign(z_j) + sign(z_j) * (\sum_i |z_i|^2) / (\sum_i |z_i|)^2
      self.gradInput:add(-2, torch.cmul(input, self.second_term)) -- -2 * z_j / (\sum_i |z_i|)
   elseif self.exempt_max and (self.exempt_max >= 1) then
      -- L2_norm_vec is already n * \sum_i (1/a - |z_i| / (\sum_j |z_j|))^(n-1) * |z_i|^2 / (\sum_j |z_j|)^2
      local first_and_second_term = torch.add(torch.ger(self.L2_norm_vec, torch.ones(input:size(input:dim()))):cmul(self.gradInput), -1,
         torch.ger(torch.pow(self.L1_norm_vec, -1), torch.ones(input:size(input:dim()))):cmul(self.scaling_term_pow_n_minus_1):cmul(input):mul(self.exempt_max))
      self.gradInput:cmul(self.scaling_term_pow_n) -- third term: (1/a - |z_j|/(\sum_i |z_i|))^n * sign(z_j)
      self.gradInput:add(first_and_second_term)
   end

   self.gradInput:mul(self.lambda_scaling * self.lambda)
   return self.gradInput
end
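
--[[ Gradient-check sketch (not from the original repo): compare updateGradInput against a
     central finite difference of updateOutput; the tolerance is loose since sign(z) is not
     differentiable at 0:

     local m = nn.L1CriterionModule(nn.L1Cost(), 1)
     local input, eps = torch.randn(3, 5), 1e-4
     m:updateOutput(input)
     local analytic = m:updateGradInput(input):clone()
     for i = 1, input:nElement() do
        local orig = input:storage()[i]
        input:storage()[i] = orig + eps
        local fplus = m:updateOutput(input)
        input:storage()[i] = orig - eps
        local fminus = m:updateOutput(input)
        input:storage()[i] = orig
        assert(math.abs((fplus - fminus) / (2 * eps) - analytic:storage()[i]) < 1e-4)
     end
--]]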

function L1CriterionModule:accGradParameters(input, gradOutput, scale)
   scale = scale or 1
   -- if the criterion is applied to minibatches, then the constraint enforced by the lagrange multiplier is similarly on entire minibatches, rather than on each element separately; at the very least, this might require a modification of the desired_criterion_value
   if self.weight then
      self.gradWeight[1] = self.gradWeight[1] - scale * self.learning_rate_scaling_factor * (self.criterion_output - self.desired_criterion_value) -- minus, since we want to maximize the error with respect to the lagrange multiplier
   end
end
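
--[[ Sketch of the Lagrange-multiplier dynamics (illustration only): accGradParameters stores
     -(criterion_output - desired_criterion_value), so a plain gradient-descent step on the
     weight performs gradient ascent on lambda, raising the penalty while the criterion
     output exceeds its target and lowering it otherwise:

     local m = nn.L1CriterionModule(nn.L1Cost(), 1, 10, 0.1)
     m:updateOutput(torch.randn(2, 4))
     m:zeroGradParameters()
     m:accGradParameters(nil, nil, 1) -- input and gradOutput are unused here
     m:updateParameters(0.01) -- moves weight[1] (lambda) up if criterion_output > 10
--]]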