Source code for tricycle.activation

from tricycle.context import TRICYCLE_CONTEXT
from tricycle.functions import Sigmoid
from tricycle.initialisers import init_xavier
from tricycle.layers import Dense, Layer
from tricycle.optimisers import Optimiser
from tricycle.tensor import Tensor
from tricycle.unary import UnaryMax


class ReLU(Layer):
    """
    Rectified Linear Unit (ReLU) activation function.

    This layer applies the ReLU function element-wise to the input tensor.

    ReLU(x) = max(0, x)
    """

    def forward(self, x: Tensor):
        """
        Apply the ReLU function to the input tensor.

        Args:
            x (Tensor): Input tensor.

        Returns:
            Tensor: Output tensor after applying ReLU.
        """
        return UnaryMax()(x, 0)
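
# Usage sketch (illustrative; not part of the library source). It assumes a
# Tensor can be built directly from a Python list and that calling a Layer
# dispatches to forward():
#
#     relu = ReLU()
#     out = relu(Tensor([-1.0, 0.0, 2.0]))
#     # elementwise max(0, x) -> [0.0, 0.0, 2.0]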


class Swish(Layer):
    """
    Swish activation function.

    This layer applies the Swish function element-wise to the input tensor.

    Swish(x) = x * sigmoid(x)

    Note: This implementation omits the learnable scaling parameter (beta)
    of the original Swish, so it is equivalent to the SiLU activation
    function.
    """

    def backward(self, grad: Tensor):
        """
        Compute the gradient of the Swish function.

        Args:
            grad (Tensor): Upstream gradient.

        Returns:
            Tensor: Gradient with respect to the input.
        """
        xp = grad.xp

        # Exponents tend to overflow/underflow when using 16 bit precision
        # so we need to switch to 32 bit
        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float32)

        exp = xp.exp(-self._input)
        numerator = 1 + exp + self._input * exp
        denominator = (1 + exp) ** 2
        coef = numerator / denominator

        if TRICYCLE_CONTEXT.use_mixed_precision:
            coef = coef.astype(xp.float16)

        return Tensor(grad * coef)

    def forward(self, tensor: Tensor):
        """
        Apply the Swish function to the input tensor.

        Args:
            tensor (Tensor): Input tensor.

        Returns:
            Tensor: Output tensor after applying Swish.
        """
        xp = tensor.xp
        self._input = tensor.array

        # Exponents tend to overflow/underflow when using 16 bit precision
        # so we need to switch to 32 bit
        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float32)

        out = tensor.array / (1 + xp.exp(-tensor.array))

        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float16)
            out = out.astype(xp.float16)

        return Tensor(
            out, args=(tensor,), back_fns=(self.backward,), name="swish"
        )
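
# Usage sketch (illustrative; not part of the library source). Because this
# Swish has no learnable scaling parameter, it behaves exactly like SiLU:
# negative inputs are damped smoothly rather than clipped to zero:
#
#     swish = Swish()
#     out = swish(Tensor([-1.0, 0.0, 2.0]))
#     # x * sigmoid(x) -> approximately [-0.269, 0.0, 1.762]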


class GeLU(Layer):
    """
    Gaussian Error Linear Unit (GELU) activation function.

    This layer applies the GELU function element-wise to the input tensor.

    GELU(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))

    Args:
        approximate (bool): Whether to use the approximate version of GELU.
            Defaults to False. Note that forward() currently always applies
            the tanh approximation above, regardless of this flag.
    """

    CONST_1 = 0.7978845608028654
    CONST_2 = 0.044715

    def __init__(self, *args, approximate: bool = False, **kwargs):
        """
        Initialize the GELU layer.

        Args:
            approximate (bool): Whether to use the approximate version of
                GELU. Defaults to False.
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        """
        super().__init__(*args, **kwargs)
        self.approximate = approximate

    def backward(self, grad: Tensor):
        """
        Compute the gradient of the GELU function.

        Args:
            grad (Tensor): Upstream gradient.

        Returns:
            Tensor: Gradient with respect to the input.
        """
        xp = grad.xp

        # Hyperbolic trig functions (cosh and tanh) use exponents under the
        # hood which can overflow/underflow when using 16 bit precision so
        # we need to switch to 32 bit precision
        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float32)

        inner = (
            self.CONST_1 * self._input * (1 + self.CONST_2 * self._input**2)
        )
        coef = (
            self.CONST_1
            * self._input
            * (1 + self.CONST_2 * 3 * self._input**2)
        )
        left = xp.tanh(inner)
        cosh = xp.cosh(inner)
        right = coef / (cosh * cosh)

        if TRICYCLE_CONTEXT.use_mixed_precision:
            left = left.astype(xp.float16)
            right = right.astype(xp.float16)

        self._grad = 0.5 * (1 + left + right) * grad.array
        result = Tensor(
            self._grad,
            is_batched=grad.is_batched,
            requires_grad=grad.requires_grad,
        )
        result.name = "gelu_back"
        return result

    def forward(self, tensor: Tensor):
        """
        Apply the GELU function to the input tensor.

        Args:
            tensor (Tensor): Input tensor.

        Returns:
            Tensor: Output tensor after applying GELU.
        """
        xp = tensor.xp
        self._input = tensor.array

        # Tanh tends to overflow/underflow when using 16 bit precision
        # so we need to switch to 32 bit
        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float32)

        inner = self.CONST_1 * (self._input + self.CONST_2 * self._input**3)
        result = self._input * 0.5 * (1 + xp.tanh(inner))

        if TRICYCLE_CONTEXT.use_mixed_precision:
            self._input = self._input.astype(xp.float16)
            result = result.astype(xp.float16)

        result = Tensor(
            result,
            is_batched=tensor.is_batched,
            requires_grad=tensor.requires_grad,
        )
        result.name = "gelu"
        result.args = (tensor,)
        result.back_fns = (self.backward,)
        return result
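
# Usage sketch (illustrative; not part of the library source), using the tanh
# approximation that forward() implements:
#
#     gelu = GeLU()
#     out = gelu(Tensor([-1.0, 0.0, 2.0]))
#     # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
#     # -> approximately [-0.159, 0.0, 1.955]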


class GLU(Layer):
    """
    Gated Linear Unit (GLU) activation function.

    This layer applies the GLU function to the input tensor.

    GLU(x) = x_left * sigmoid(x_right)

    Args:
        size (int): Size of the input tensor.
        initialiser (callable): Function to initialise the weights.
            Defaults to init_xavier.
    """

    linear: Dense

    def __init__(self, size: int, initialiser=init_xavier, *args, **kwargs):
        """
        Initialize the GLU layer.

        Args:
            size (int): Size of the input tensor.
            initialiser (callable): Function to initialise the weights.
                Defaults to init_xavier.
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.
        """
        super().__init__(*args, **kwargs)
        self.linear = Dense(size, 2 * size, initialiser)
        self.layers = [self.linear]
        self.sigmoid = Sigmoid()

    def forward(self, x: Tensor):
        """
        Apply the GLU function to the input tensor.

        Args:
            x (Tensor): Input tensor.

        Returns:
            Tensor: Output tensor after applying GLU.
        """
        x = self.linear(x)
        left, right = x.split(2)
        return left * self.sigmoid(right)

    def update(self, optimiser: Optimiser):
        """
        Update the layer parameters using the given optimiser.

        Args:
            optimiser (Optimiser): The optimiser to use for updating
                parameters.
        """
        self.linear.update(optimiser)

    def zero_grad(self):
        """
        Reset the gradients of the layer parameters to zero.
        """
        self.linear.zero_grad()

    def to_gpu(self):
        """
        Move the layer parameters to GPU memory.
        """
        self.linear.to_gpu()

    def from_gpu(self):
        """
        Move the layer parameters from GPU to CPU memory.
        """
        self.linear.from_gpu()
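
# Usage sketch (illustrative; not part of the library source). A GLU of
# size n projects its input to 2n features with a Dense layer, splits the
# result into two halves, and gates one half with the sigmoid of the other,
# so the output has n features again:
#
#     glu = GLU(size=4)
#     out = glu(Tensor([1.0, 2.0, 3.0, 4.0]))  # left * sigmoid(right)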