Source code for tricycle.blocks

"""
Several layers can be grouped together into a single layer called a block.

This module provides various block implementations used in transformer-based
models, including multi-head self-attention, MLP blocks, and transformer blocks.
"""

from typing import Literal

import numpy as np

# SwiGLU is referenced by FeedForward below; it is assumed to live alongside
# the other activation layers
from tricycle.activation import GLU, GeLU, ReLU, SwiGLU, Swish
from tricycle.attention import Attention
from tricycle.initialisers import init_xavier
from tricycle.layers import (  # noqa: E501
    Dense,
    Dropout,
    Layer,
    LayerNorm,
    RMSNorm,
)
from tricycle.optimisers import Optimiser
from tricycle.tensor import Tensor


def build_mask(context_window: int) -> Tensor:
    """
    Build an attention mask to stop the model from being able to see future
    tokens.

    Args:
        context_window (int): The size of the context window.

    Returns:
        Tensor: A mask tensor with shape (context_window, context_window).
    """
    NEGATIVE_INFINITY = -10_000
    mask = np.ones((context_window, context_window))
    idx = np.tril(mask.astype(bool))
    mask[~idx] = NEGATIVE_INFINITY
    mask[idx] = 0
    return Tensor(mask, requires_grad=False, name="mask")
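
# Usage sketch (illustrative, not part of the module): the returned mask is 0
# on and below the diagonal and -10_000 above it, so adding it to raw
# attention scores drives the post-softmax weight of future positions to ~0.
#
#     mask = build_mask(context_window=4)
#     # Row 1 of the mask reads [0, 0, -10_000, -10_000]: token 1 may attend
#     # to tokens 0 and 1 but not to tokens 2 and 3.
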
def masked_fill(
    tensor: Tensor, mask_shape: tuple[int, int], full_mask: Tensor
):
    """
    Apply an attention mask to a tensor.

    Args:
        tensor (Tensor): The input tensor to be masked.
        mask_shape (tuple[int, int]): The shape of the mask to be applied.
        full_mask (Tensor): The full mask tensor.

    Returns:
        Tensor: The masked tensor.
    """
    xp = tensor.xp
    repeats = tensor.shape[1] if tensor.is_batched else tensor.shape[0]
    mask = xp.stack([full_mask[: mask_shape[0], : mask_shape[1]]] * repeats)
    mask = Tensor(mask, requires_grad=False, name="mask")
    result = tensor + mask
    result.name = "masked"
    return result
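
# Usage sketch (illustrative, not part of the module), assuming Tensor accepts
# a NumPy array directly, as in build_mask above. The shapes below are
# hypothetical: an unbatched score tensor of shape (n_heads, tokens, tokens).
#
#     scores = Tensor(np.zeros((2, 3, 3)), name="scores")
#     full_mask = build_mask(context_window=8)
#     masked = masked_fill(scores, mask_shape=(3, 3), full_mask=full_mask)
#     # The top-left (3, 3) corner of full_mask is stacked once per head and
#     # added to the scores, leaving positions above the diagonal at -10_000.
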
class MultiHeadSelfAttention(Layer):
    """
    Multi-head self-attention layer.

    This layer implements the multi-head self-attention mechanism used in
    transformer models.

    Attributes:
        embedding_dim (int): The dimension of the input embeddings.
        n_heads (int): The number of attention heads.
        context_window (int): The size of the context window.
    """

    embedding_dim: int
    n_heads: int
    context_window: int

    def __init__(
        self,
        embedding_dim: int,
        n_heads: int,
        context_window: int,
        residual_dropout_prob: float = 0.0,
        initialiser=init_xavier,
    ):
        """
        Initialize the MultiHeadSelfAttention layer.

        Args:
            embedding_dim (int): The dimension of the input embeddings.
            n_heads (int): The number of attention heads.
            context_window (int): The size of the context window.
            residual_dropout_prob (float, optional): The dropout probability
                for residual connections. Defaults to 0.0.
            initialiser (function, optional): The initializer function for
                weights. Defaults to init_xavier.
        """
        # set the constants
        self.embedding_dim = embedding_dim
        self.n_heads = n_heads
        self.context_window = context_window

        # Project the embedding into 3 embeddings. One for each of key, query
        # and value
        self.in_projection = Dense(
            from_size=self.embedding_dim,
            to_size=self.embedding_dim * 3,
            initialiser=initialiser,
            name="in_projection",
        )

        # Pass the final embedding through a linear layer
        self.out_projection = Dense(
            from_size=self.embedding_dim,
            to_size=self.embedding_dim,
            initialiser=initialiser,
            name="out_projection",
        )

        self.residual_dropout = Dropout(residual_dropout_prob)

        self.layers = [
            self.in_projection,
            self.residual_dropout,
            self.out_projection,
        ]

        self.attention = Attention(
            embedding_dim=embedding_dim,
            n_heads=n_heads,
            context_window=context_window,
        )

    def forward(self, tensor: Tensor):
        """
        Perform a forward pass through the multi-head self-attention layer.

        Args:
            tensor (Tensor): The input tensor.

        Returns:
            Tensor: The output tensor after applying multi-head
                self-attention.
        """
        # expand the input
        tensor = self.in_projection(tensor)

        attention = self.attention(tensor)

        # project back
        projected = self.out_projection(attention)
        projected = self.residual_dropout(projected)

        return projected

    def update(self, optimiser: Optimiser):
        """
        Update the layer's parameters using the given optimizer.

        Args:
            optimiser (Optimiser): The optimizer to use for updating
                parameters.
        """
        self.in_projection.update(optimiser)
        self.out_projection.update(optimiser)

    def zero_grad(self):
        """
        Zero out the gradients of the layer's parameters.
        """
        self.in_projection.zero_grad()
        self.out_projection.zero_grad()

    def to_gpu(self, device: int = 0):
        """
        Move the layer's parameters to the GPU.

        Args:
            device (int, optional): The GPU device number. Defaults to 0.
        """
        self.in_projection.to_gpu(device)
        self.out_projection.to_gpu(device)
        self.attention.to_gpu(device)

    def from_gpu(self):
        """
        Move the layer's parameters from the GPU to the CPU.
        """
        self.in_projection.from_gpu()
        self.out_projection.from_gpu()
        self.attention.from_gpu()
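
# Usage sketch (illustrative, not part of the module). The layer maps an input
# whose final dimension is embedding_dim back to embedding_dim: a
# key/query/value projection, attention, and an output projection with
# dropout. The Tensor construction and the exact input shape expected by
# Attention are assumptions; treat them as placeholders.
#
#     attn = MultiHeadSelfAttention(
#         embedding_dim=64, n_heads=4, context_window=32
#     )
#     x = Tensor(np.random.random((32, 64)), name="activations")
#     out = attn(x)  # same trailing dimension as the input: embedding_dim
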
class MLPBlock(Layer):
    """
    A simple GPT-2 style MLP block with 2 linear layers around an activation
    function.

    The size of the hidden dimension is expansion_ratio * the size of the
    input.

    Attributes:
        embedding_dim (int): The dimension of the input embeddings.
        dropout_prob (float): The dropout probability.
        expansion_ratio (float): The ratio for expanding the hidden dimension.
        activation_fn (Layer): The activation function to use.
        linear_1 (Dense): The first linear layer.
        linear_2 (Dense): The second linear layer.
    """

    embedding_dim: int
    dropout_prob: float
    expansion_ratio: float
    activation_fn: Layer
    linear_1: Dense
    linear_2: Dense

    def __init__(
        self,
        embedding_dim: int,
        dropout_prob: float,
        expansion_ratio: float = 4,
        activation_fn: Layer | str = GeLU(),
    ):
        """
        Initialize the MLPBlock.

        Args:
            embedding_dim (int): The dimension of the input embeddings.
            dropout_prob (float): The dropout probability.
            expansion_ratio (float, optional): The ratio for expanding the
                hidden dimension. Defaults to 4.
            activation_fn (Layer | str, optional): The activation function to
                use. Defaults to GeLU().
        """
        self.linear_1 = Dense(
            from_size=embedding_dim,
            to_size=int(expansion_ratio * embedding_dim),
            initialiser=init_xavier,
            name="linear_1",
        )
        self.linear_2 = Dense(
            from_size=int(expansion_ratio * embedding_dim),
            to_size=embedding_dim,
            initialiser=init_xavier,
            name="linear_2",
        )
        self.dropout = Dropout(dropout_prob)

        if isinstance(activation_fn, str):
            match activation_fn:
                case "gelu":
                    activation_fn = GeLU()
                case "relu":
                    activation_fn = ReLU()
                case "swish":
                    activation_fn = Swish()
                case "glu":
                    activation_fn = GLU(int(expansion_ratio * embedding_dim))
                case _:
                    raise NotImplementedError(
                        f"Activation function {activation_fn} is not "
                        "yet implemented"
                    )
        self.activation_fn = activation_fn

        self.layers = [
            self.linear_1,
            self.activation_fn,
            self.linear_2,
            self.dropout,
        ]

    def forward(self, x: Tensor):
        """
        Perform a forward pass through the MLP block.

        Args:
            x (Tensor): The input tensor.

        Returns:
            Tensor: The output tensor after applying the MLP block.
        """
        x = self.linear_1(x)
        x = self.activation_fn(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

    def update(self, optimiser: Optimiser):
        """
        Update the layer's parameters using the given optimizer.

        Args:
            optimiser (Optimiser): The optimizer to use for updating
                parameters.

        Returns:
            MLPBlock: The updated MLPBlock instance.
        """
        self.linear_1.update(optimiser)
        self.linear_2.update(optimiser)
        return self

    def zero_grad(self):
        """
        Zero out the gradients of the layer's parameters.

        Returns:
            MLPBlock: The MLPBlock instance with zeroed gradients.
        """
        self.linear_1.zero_grad()
        self.linear_2.zero_grad()
        return self

    def to_gpu(self, device: int = 0):
        """
        Move the layer's parameters to the GPU.

        Args:
            device (int, optional): The GPU device number. Defaults to 0.

        Returns:
            MLPBlock: The MLPBlock instance with parameters moved to GPU.
        """
        self.linear_1.to_gpu(device)
        self.linear_2.to_gpu(device)
        return self

    def from_gpu(self):
        """
        Move the layer's parameters from the GPU to the CPU.

        Returns:
            MLPBlock: The MLPBlock instance with parameters moved to CPU.
        """
        self.linear_1.from_gpu()
        self.linear_2.from_gpu()
        return self
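
# Usage sketch (illustrative, not part of the module). With the default
# expansion_ratio of 4, an embedding_dim of 64 gives a hidden size of 256. The
# activation may be passed as a string; anything other than "gelu", "relu",
# "swish" or "glu" raises NotImplementedError.
#
#     mlp = MLPBlock(embedding_dim=64, dropout_prob=0.1, activation_fn="gelu")
#     # forward: Dense(64 -> 256) -> GeLU -> Dense(256 -> 64) -> Dropout(0.1)
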
class GPT2TransformerBlock(Layer):
    """
    A GPT-2 style transformer block.

    This block combines multi-head self-attention with an MLP block and
    includes normalization and residual connections.

    Attributes:
        embedding_dim (int): The dimension of the input embeddings.
        expansion_ratio (float): The ratio for expanding the hidden dimension
            in the MLP block.
        activation_fn (Layer): The activation function to use in the MLP
            block.
        residual_dropout_prob (float): The dropout probability for residual
            connections.
        linear_dropout_prob (float): The dropout probability for the MLP
            block.
    """

    embedding_dim: int
    expansion_ratio: float
    activation_fn: Layer
    residual_dropout_prob: float
    linear_dropout_prob: float

    def __init__(
        self,
        embedding_dim: int,
        n_heads: int,
        context_window: int,
        expansion_ratio: float = 4,
        activation_fn: Layer | str = GeLU(),
        norm_fn: Literal["layer_norm"] | Literal["rms_norm"] = "layer_norm",
        residual_dropout_prob: float = 0,
        linear_dropout_prob: float = 0,
    ):
        """
        Initialize the GPT2TransformerBlock.

        Args:
            embedding_dim (int): The dimension of the input embeddings.
            n_heads (int): The number of attention heads.
            context_window (int): The size of the context window.
            expansion_ratio (float, optional): The ratio for expanding the
                hidden dimension in the MLP block. Defaults to 4.
            activation_fn (Layer | str, optional): The activation function to
                use in the MLP block. Defaults to GeLU().
            norm_fn (Literal["layer_norm"] | Literal["rms_norm"], optional):
                The normalization function to use. Defaults to "layer_norm".
            residual_dropout_prob (float, optional): The dropout probability
                for residual connections. Defaults to 0.
            linear_dropout_prob (float, optional): The dropout probability for
                the MLP block. Defaults to 0.
        """
        self.attention_block = MultiHeadSelfAttention(
            embedding_dim,
            n_heads=n_heads,
            context_window=context_window,
            residual_dropout_prob=residual_dropout_prob,
            initialiser=init_xavier,
        )
        self.mlp_block = MLPBlock(
            embedding_dim,
            linear_dropout_prob,
            expansion_ratio,
            activation_fn,
        )

        match norm_fn:
            case "layer_norm":
                self.norm_1 = LayerNorm(embedding_dim)
                self.norm_2 = LayerNorm(embedding_dim)
            case "rms_norm":
                self.norm_1 = RMSNorm(embedding_dim)
                self.norm_2 = RMSNorm(embedding_dim)
            case _:
                raise ValueError(f"Unknown norm: {norm_fn}")

        self.layers = [
            self.norm_1,
            self.attention_block,
            self.norm_2,
            self.mlp_block,
        ]

    def forward(self, x: Tensor):
        """
        Perform a forward pass through the GPT-2 transformer block.

        Args:
            x (Tensor): The input tensor.

        Returns:
            Tensor: The output tensor after applying the transformer block.
        """
        normed = self.norm_1(x)
        attn = self.attention_block(normed)
        attn += x

        x = self.norm_2(attn)
        x = self.mlp_block(x)
        x += attn

        return x

    def update(self, optimiser: Optimiser):
        """
        Update the layer's parameters using the given optimizer.

        Args:
            optimiser (Optimiser): The optimizer to use for updating
                parameters.
        """
        self.attention_block.update(optimiser)
        self.mlp_block.update(optimiser)
        self.norm_1.update(optimiser)
        self.norm_2.update(optimiser)

    def zero_grad(self):
        """
        Zero out the gradients of the layer's parameters.
        """
        self.attention_block.zero_grad()
        self.mlp_block.zero_grad()
        self.norm_1.zero_grad()
        self.norm_2.zero_grad()

    def to_gpu(self, device: int = 0):
        """
        Move the layer's parameters to the GPU.

        Args:
            device (int, optional): The GPU device number. Defaults to 0.
        """
        self.attention_block.to_gpu(device)
        self.mlp_block.to_gpu(device)
        self.norm_1.to_gpu(device)
        self.norm_2.to_gpu(device)

    def from_gpu(self):
        """
        Move the layer's parameters from the GPU to the CPU.
        """
        self.attention_block.from_gpu()
        self.mlp_block.from_gpu()
        self.norm_1.from_gpu()
        self.norm_2.from_gpu()
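
# Usage sketch (illustrative, not part of the module). The block is pre-norm:
# each sub-layer sees a normalised input and its output is added back onto the
# residual stream, so the output keeps the input's shape.
#
#     block = GPT2TransformerBlock(
#         embedding_dim=64,
#         n_heads=4,
#         context_window=32,
#         norm_fn="rms_norm",  # or the default "layer_norm"
#     )
#     # forward(x): h = x + attention_block(norm_1(x))
#     #             return h + mlp_block(norm_2(h))
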
class FeedForward(Layer):
    """A simple llama style feed forward block with 2 linear layers around a
    swiglu function.

    The size of the hidden dimension is expansion_ratio * the size of the
    input.

    Attributes:
        embedding_dim: The dimension of the input embedding.
        dropout_prob: The probability of dropout.
        expansion_ratio: The ratio to expand the hidden dimension.
        activation_fn: The activation function to use.
        linear_1: The first linear layer.
        linear_2: The second linear layer.

    Args:
        embedding_dim: The dimension of the input embedding.
        dropout_prob: The probability of dropout.
        expansion_ratio: The ratio to expand the hidden dimension.
            Defaults to 4.
        activation_fn: The activation function to use. Can be a Layer object
            or a string. Defaults to GeLU().
    """

    embedding_dim: int
    dropout_prob: float
    expansion_ratio: float
    activation_fn: Layer
    linear_1: Dense
    linear_2: Dense

    def __init__(
        self,
        embedding_dim: int,
        dropout_prob: float,
        expansion_ratio: float = 4,
        activation_fn: Layer | str = GeLU(),
    ):
        self.linear_1 = Dense(
            from_size=embedding_dim,
            to_size=int(expansion_ratio * embedding_dim),
            initialiser=init_xavier,
            name="linear_1",
        )
        self.linear_2 = Dense(
            from_size=int(expansion_ratio * embedding_dim),
            to_size=embedding_dim,
            initialiser=init_xavier,
            name="linear_2",
        )
        self.dropout = Dropout(dropout_prob)

        if isinstance(activation_fn, str):
            match activation_fn:
                case "gelu":
                    activation_fn = GeLU()
                case "relu":
                    activation_fn = ReLU()
                case "swish":
                    activation_fn = Swish()
                case "glu":
                    activation_fn = GLU(int(expansion_ratio * embedding_dim))
                case "swiglu":
                    activation_fn = SwiGLU(
                        int(expansion_ratio * embedding_dim)
                    )
                case _:
                    raise NotImplementedError(
                        f"Activation function {activation_fn} is not "
                        "yet implemented"
                    )
        self.activation_fn = activation_fn

        self.layers = [
            self.linear_1,
            self.activation_fn,
            self.linear_2,
            self.dropout,
        ]

    def forward(self, x: Tensor) -> Tensor:
        """Forward pass of the FeedForward layer.

        Args:
            x: Input tensor.

        Returns:
            The output tensor after passing through the feed-forward block.
        """
        x = self.linear_1(x)
        x = self.activation_fn(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

    def update(self, optimiser: Optimiser) -> "FeedForward":
        """Update the parameters of the layer using the given optimiser.

        Args:
            optimiser: The optimiser to use for updating the parameters.

        Returns:
            The updated FeedForward layer.
        """
        self.linear_1.update(optimiser)
        self.linear_2.update(optimiser)
        return self

    def zero_grad(self) -> "FeedForward":
        """Zero out the gradients of the layer's parameters.

        Returns:
            The FeedForward layer with zeroed gradients.
        """
        self.linear_1.zero_grad()
        self.linear_2.zero_grad()
        return self

    def to_gpu(self, device: int = 0) -> "FeedForward":
        """Move the layer to the GPU.

        Args:
            device: The GPU device number to move the layer to.
                Defaults to 0.

        Returns:
            The FeedForward layer moved to the GPU.
        """
        self.linear_1.to_gpu(device)
        self.linear_2.to_gpu(device)
        return self

    def from_gpu(self) -> "FeedForward":
        """Move the layer from the GPU to the CPU.

        Returns:
            The FeedForward layer moved to the CPU.
        """
        self.linear_1.from_gpu()
        self.linear_2.from_gpu()
        return self
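
# Usage sketch (illustrative, not part of the module). FeedForward mirrors
# MLPBlock but additionally accepts "swiglu" as a string activation, the
# Llama-style choice its docstring describes:
#
#     ffn = FeedForward(
#         embedding_dim=64, dropout_prob=0.0, activation_fn="swiglu"
#     )
#     # forward: Dense(64 -> 256) -> SwiGLU -> Dense(256 -> 64) -> Dropout
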