Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions colossalai/kernel/jit/bias_gelu.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import torch


###### BIAS GELU FUSION/ NO AUTOGRAD ################
# 1/sqrt(2*pi)-> 0.3989423
# 1/sqrt(2) -> 0.70710678
Expand All @@ -9,10 +8,12 @@
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))


@torch.jit.script
def bias_gelu(bias: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Add ``bias`` to ``y`` and apply the tanh approximation of GELU.

    Computes ``x * 0.5 * (1 + tanh(sqrt(2/pi) * x * (1 + 0.044715 * x^2)))``
    with ``x = bias + y``. The addition follows normal broadcasting rules.

    Args:
        bias: Tensor added to ``y`` before the activation.
        y: Input tensor.

    Returns:
        The activated tensor, shaped like ``bias + y``.
    """
    x = bias + y
    # 0.79788456 is sqrt(2/pi); 0.044715 is the cubic coefficient of the
    # tanh approximation.
    inner = 0.79788456 * x * (1 + 0.044715 * x * x)
    return x * 0.5 * (1.0 + torch.tanh(inner))


# gradient of tanh approximation of gelu
# gradient of actual gelu is:
Expand All @@ -23,9 +24,11 @@ def bias_gelu_back(g, bias, y):
tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
# sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
return ff*g
return ff * g


class GeLUFunction(torch.autograd.Function):

@staticmethod
# bias is an optional argument
def forward(ctx, input, bias):
Expand All @@ -38,4 +41,5 @@ def backward(ctx, grad_output):
tmp = bias_gelu_back(grad_output, bias, input)
return tmp, tmp

# Callable alias for the autograd-aware fused bias + GeLU: invoke as
# bias_gelu_impl(input, bias), which dispatches to GeLUFunction.forward.
bias_gelu_impl = GeLUFunction.apply