Source code for tricycle.configs

"""Configurations for different GPT models.

This module contains configuration classes for various GPT models, including
a base configuration class and specific configurations for debugging,
Shakespeare-based models, and a small GPT model.

Classes:
    GPTConfig: Base configuration class for GPT models.
    DebugConfig: Configuration for debugging purposes.
    ShakespeareConfig: Configuration for Shakespeare-based models.
    SmolGPTConfig: Configuration for a small GPT model.
"""

from typing import Literal


class GPTConfig:
    """Base configuration class for GPT models.

    This class defines the common parameters and hyperparameters used in GPT
    model training and evaluation.

    Attributes:
        embedding_dim (int): Dimension of the embedding layer.
        context_window (int): Size of the context window.
        vocab_size (int): Size of the vocabulary.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of transformer layers.
        expansion_ratio (float): Expansion ratio for feed-forward layers.
        activation_fn (str): Activation function used in the model.
        norm_fn (str): Normalization function used in the model.
        input_dropout_prob (float): Dropout probability for input embeddings.
        residual_dropout_prob (float): Dropout probability for residual
            connections.
        linear_dropout_prob (float): Dropout probability for linear layers.
        max_learning_rate (float): Maximum learning rate for training.
        min_learning_rate (float): Minimum learning rate for training.
        warmup_steps (int): Number of warmup steps for learning rate
            scheduling.
        weight_decay (float): Weight decay factor for regularization.
        momentum (float): Momentum factor for optimization.
        beta1 (float): Beta1 parameter for the Adam optimizer.
        beta2 (float): Beta2 parameter for the Adam optimizer.
        steps (int | Literal["chinchilla_optimal"]): Number of training
            steps, or "chinchilla_optimal".
        eval_interval (int): Interval between evaluations.
        batch_size (int): Batch size for training.
        gradient_accumulation_steps (int): Number of steps for gradient
            accumulation.
        device_idx (int): Index of the device to use for training.
        mlflow_tracking_uri (str): URI of the MLflow tracking server.
        mlflow_experiment_name (str): Name of the MLflow experiment.
    """

    embedding_dim: int
    context_window: int
    vocab_size: int
    n_heads: int
    n_layers: int
    expansion_ratio: float
    activation_fn: str
    norm_fn: str
    input_dropout_prob: float
    residual_dropout_prob: float
    linear_dropout_prob: float
    max_learning_rate: float
    min_learning_rate: float
    warmup_steps: int
    weight_decay: float
    momentum: float
    beta1: float
    beta2: float
    steps: int | Literal["chinchilla_optimal"]
    eval_interval: int
    batch_size: int
    gradient_accumulation_steps: int
    device_idx: int
    mlflow_tracking_uri: str
    mlflow_experiment_name: str
    def dict(self) -> dict[str, int | float | str | bool]:
        """Convert the configuration to a dictionary.

        Returns:
            dict[str, int | float | str | bool]: A dictionary representation
                of the configuration.
        """
        out = {}
        for k, v in self.__class__.__dict__.items():
            if k.startswith("__"):
                continue
            if callable(v):
                continue
            out[k] = v
        return out
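
# A quick illustration of what ``dict()`` produces. Because it reads
# ``self.__class__.__dict__``, it only reports attributes assigned on the
# concrete subclass (such as DebugConfig below), not the bare annotations
# on GPTConfig itself:
#
#     >>> params = DebugConfig().dict()
#     >>> params["embedding_dim"], params["mlflow_enabled"]
#     (14, False)
#     >>> GPTConfig().dict()
#     {}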
class DebugConfig(GPTConfig):
    """Configuration for debugging purposes.

    This class inherits from GPTConfig and sets specific values for
    debugging.
    """

    embedding_dim = 14
    context_window = 13
    vocab_size = 11
    n_heads = 2
    n_layers = 1
    expansion_ratio = 4
    activation_fn = "gelu"
    norm_fn = "layer_norm"
    input_dropout_prob = 0.2
    residual_dropout_prob = 0.2
    linear_dropout_prob = 0.2
    max_learning_rate = 1e-3
    min_learning_rate = 1e-4
    warmup_steps = 100
    weight_decay = 1e-1
    momentum = 0
    beta1 = 0.9
    beta2 = 0.99
    steps = 250
    eval_interval = 1
    eval_steps = 1
    batch_size = 5
    gradient_accumulation_steps = 1
    sample_size = 4
    device_idx = 0
    mlflow_enabled = False
    mlflow_tracking_uri = ""
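
# Because each config is a set of plain class attributes, a one-off variant
# for an experiment can be made by subclassing and overriding fields. A
# hypothetical example (not part of this module):
#
#     class WiderDebugConfig(DebugConfig):
#         embedding_dim = 28
#         n_heads = 4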
class ShakespeareConfig(GPTConfig):
    """Configuration for Shakespeare-based models.

    This class inherits from GPTConfig and sets specific values for
    Shakespeare-based language models.
    """

    embedding_dim = 384
    context_window = 256
    vocab_size = 1024
    n_heads = 6
    n_layers = 6
    expansion_ratio = 4
    activation_fn = "gelu"
    norm_fn = "layer_norm"
    input_dropout_prob = 0.2
    residual_dropout_prob = 0.2
    linear_dropout_prob = 0.2
    max_learning_rate = 1e-2
    min_learning_rate = 1e-4
    warmup_steps = 100
    weight_decay = 1e-1
    momentum = 0
    beta1 = 0.9
    beta2 = 0.99
    steps = 5000
    eval_interval = 250
    eval_steps = 128
    batch_size = 128
    gradient_accumulation_steps = 1
    sample_size = 512
    device_idx = 1
    mlflow_enabled = True
    mlflow_tracking_uri = "http://localhost:5000"
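
# The max/min learning-rate and warmup fields are consumed by the training
# loop, which is not shown on this page. A minimal sketch (an assumption
# about the schedule these fields suggest, not tricycle's actual scheduler;
# the function and parameter names here are hypothetical) of linear warmup
# followed by cosine decay:
#
#     import math
#
#     def learning_rate_at(
#         step: int, config: GPTConfig, total_steps: int
#     ) -> float:
#         if step < config.warmup_steps:
#             # linear warmup from 0 up to max_learning_rate
#             return config.max_learning_rate * (step + 1) / config.warmup_steps
#         # cosine decay from max_learning_rate down to min_learning_rate
#         progress = (step - config.warmup_steps) / max(
#             1, total_steps - config.warmup_steps
#         )
#         coeff = 0.5 * (1.0 + math.cos(math.pi * progress))
#         return config.min_learning_rate + coeff * (
#             config.max_learning_rate - config.min_learning_rate
#         )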
class SmolGPTConfig(GPTConfig):
    """Configuration for a small GPT model.

    This class inherits from GPTConfig and sets specific values for a
    small-scale GPT model.
    """

    embedding_dim = 768
    context_window = 1024
    vocab_size = 50256
    n_heads = 12
    n_layers = 12
    expansion_ratio = 4
    activation_fn = "gelu"
    norm_fn = "layer_norm"
    input_dropout_prob = 0
    residual_dropout_prob = 0
    linear_dropout_prob = 0
    max_learning_rate = 6e-4
    min_learning_rate = 0
    warmup_steps = 150  # roughly matches Andrej's warmup steps in llm.c
    weight_decay = 1e-1
    momentum = 0
    beta1 = 0.9
    beta2 = 0.95
    steps = "chinchilla_optimal"
    eval_interval = 100
    eval_steps = 128
    batch_size = 4
    gradient_accumulation_steps = 128  # effective batch size of 524288 tokens
    n_tokens_to_generate = 512
    device_idx = 0
    mlflow_enabled = True
    mlflow_tracking_uri = "http://localhost:5000"
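
# The accumulation comment above can be checked directly: each optimizer
# step processes batch_size * gradient_accumulation_steps sequences of
# context_window tokens, i.e. 4 * 128 * 1024 = 524,288 tokens:
#
#     >>> config = SmolGPTConfig()
#     >>> (config.batch_size
#     ...  * config.gradient_accumulation_steps
#     ...  * config.context_window)
#     524288
#
# ``steps = "chinchilla_optimal"`` is resolved by the training code, not
# here; presumably it derives a step count from the Chinchilla heuristic of
# roughly 20 training tokens per model parameter.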