# Source code for tricycle.models

"""
GPT model implementation using the Tricycle framework.

This module defines the GPT class, which implements a GPT-style transformer model
using components from the Tricycle framework.
"""

import humanize
import numpy as np

from tricycle.blocks import GPT2TransformerBlock
from tricycle.configs import GPTConfig
from tricycle.layers import (
    Dense,
    Dropout,
    Embedding,
    Layer,
    LayerNorm,
    RMSNorm,
)
from tricycle.optimisers import Optimiser
from tricycle.tensor import Tensor


[docs] class GPT(Layer): """ Generative Pre-trained Transformer (GPT) model implementation. This class implements a GPT-style transformer model using components from the Tricycle framework. It includes token and position embeddings, multiple transformer blocks, and a final output layer. Attributes: embedding_dim (int): Dimension of the embedding space. context_window (int): Size of the context window for position embeddings. token_embedding (Embedding): Embedding layer for input tokens. position_embedding (Embedding): Embedding layer for positional information. input_dropout (Dropout): Dropout layer applied to the input embeddings. blocks (list): List of GPT2TransformerBlock instances. head (Dense): Final dense layer for output. norm (LayerNorm or RMSNorm): Normalization layer. layers (list): List of all layers in the model. """ def __init__(self, config: GPTConfig): """ Initializes the GPT model with the given configuration. Args: config (GPTConfig): Configuration object containing model parameters. 
""" self.embedding_dim = config.embedding_dim self.context_window = config.context_window self.token_embedding = Embedding( to_size=self.embedding_dim, from_size=config.vocab_size, name="token_embedding", ) self.position_embedding = Embedding( to_size=self.embedding_dim, from_size=self.context_window, name="position_embedding", ) self.input_dropout = Dropout(config.input_dropout_prob) self.blocks = [ GPT2TransformerBlock( embedding_dim=self.embedding_dim, n_heads=config.n_heads, context_window=self.context_window, expansion_ratio=config.expansion_ratio, activation_fn=config.activation_fn, norm_fn=config.norm_fn, ) for _ in range(config.n_layers) ] self.head = Dense( to_size=config.vocab_size, from_size=self.embedding_dim, name="head", ) match config.norm_fn: case "layer_norm": self.norm = LayerNorm(self.embedding_dim) case "rms_norm": self.norm = RMSNorm(self.embedding_dim) case _: raise ValueError(f"Unknown norm: {config.norm_fn}") self.layers = [ self.token_embedding, self.position_embedding, self.input_dropout, *self.blocks, self.norm, self.head, ]
[docs] def forward(self, tensor: Tensor) -> Tensor: """ Performs a forward pass through the GPT model. Args: tensor (Tensor): Input tensor, expected to be one-hot encoded. Returns: Tensor: Output tensor after passing through the model. Raises: AssertionError: If the input tensor doesn't match the expected context window size. """ xp = tensor.xp if tensor.ndim == 1: n_tokens = tensor.shape[-1] tensor.array = xp.expand_dims(tensor.array, 0) tensor = tensor.to_batched() else: n_tokens = tensor.shape[-1] assert n_tokens == self.context_window, ( "Expected a full context window. ", f"Found {n_tokens=} and {self.context_window=}", ) position = Tensor( xp.arange(self.context_window), requires_grad=False, dtype=int, ) pos_embedding = self.position_embedding(position) token_embedding = self.token_embedding(tensor) embedding = token_embedding + pos_embedding embedding = self.input_dropout(embedding) for i, block in enumerate(self.blocks): embedding = block(embedding) embedding = self.norm(embedding) embedding = self.head(embedding) return embedding
[docs] def zero_grad(self): """ Zeroes out the gradients of all layers in the model. Returns: GPT: The current GPT instance. """ self.token_embedding.zero_grad() self.position_embedding.zero_grad() self.norm.zero_grad() self.head.zero_grad() for block in self.blocks: block.zero_grad() return self
[docs] def update(self, optimiser: Optimiser): """ Updates all layers in the model using the provided optimiser. Args: optimiser (Optimiser): The optimiser to use for updating model parameters. Returns: GPT: The current GPT instance. """ self.token_embedding.update(optimiser) self.position_embedding.update(optimiser) self.norm.update(optimiser) self.head.update(optimiser) for block in self.blocks: block.update(optimiser) return self
[docs] def to_gpu(self, device: int = 0): """ Moves all layers of the model to the specified GPU device. Args: device (int, optional): The GPU device number. Defaults to 0. Returns: GPT: The current GPT instance. """ self.token_embedding.to_gpu(device) self.position_embedding.to_gpu(device) for block in self.blocks: block.to_gpu(device) self.norm.to_gpu(device) self.head.to_gpu(device) return self
[docs] def from_gpu(self): """ Moves all layers of the model from GPU back to CPU. Returns: GPT: The current GPT instance. """ self.token_embedding.from_gpu() self.position_embedding.from_gpu() for block in self.blocks: block.from_gpu() self.norm.from_gpu() self.head.from_gpu() return self
[docs] def display(self): """Prints a string representation of the model.""" print(self)
def _contents(self): """ Returns a flattened list of the layers in this model, along with their depth in the tree of layers. Returns: list: A list of tuples containing layer name, size, and depth. """ stack = [(self, 0)] contents = [] while stack: node, indent = stack.pop() tensors = list(node.tensors.values()) shapes = [t.shape for t in tensors] size = sum(np.prod(shape) for shape in shapes) contents.append((node.__class__.__name__, size, indent)) stack.extend((layer, indent + 1) for layer in node.layers[::-1]) return contents def __str__(self): """ Returns a string representation of the model, including layer sizes and total parameter count. Returns: str: A formatted string representing the model structure and size. """ string = "" total = 0 for layer, size, n_indent in self._contents(): total += size size = humanize.scientific(size) if size else "" indent = " " * n_indent string += f"{indent}{layer}({size})\n" PARAM_SIZE = self.head.weights[0][0].dtype.itemsize total *= PARAM_SIZE string += "Total size:\n" string += f" - {humanize.naturalsize(total)}\n" string += "Total parameters:\n" string += f" - {humanize.intword(total/PARAM_SIZE)}\n" return string