"""Configurations for different GPT models.This module contains configuration classes for various GPT models, includinga base configuration class and specific configurations for debugging,Shakespeare-based models, and a small GPT model.Classes: GPTConfig: Base configuration class for GPT models. DebugConfig: Configuration for debugging purposes. ShakespeareConfig: Configuration for Shakespeare-based models. SmolGPTConfig: Configuration for a small GPT model."""fromtypingimportLiteral


class GPTConfig:
    """Base configuration class for GPT models.

    This class defines the common parameters and hyperparameters used in
    GPT model training and evaluation.

    Attributes:
        embedding_dim (int): Dimension of the embedding layer.
        context_window (int): Size of the context window.
        vocab_size (int): Size of the vocabulary.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of transformer layers.
        expansion_ratio (float): Expansion ratio for feed-forward layers.
        activation_fn (str): Activation function used in the model.
        norm_fn (str): Normalization function used in the model.
        input_dropout_prob (float): Dropout probability for input embeddings.
        residual_dropout_prob (float): Dropout probability for residual
            connections.
        linear_dropout_prob (float): Dropout probability for linear layers.
        max_learning_rate (float): Maximum learning rate for training.
        min_learning_rate (float): Minimum learning rate for training.
        warmup_steps (int): Number of warmup steps for learning rate
            scheduling.
        weight_decay (float): Weight decay factor for regularization.
        momentum (float): Momentum factor for optimization.
        beta1 (float): Beta1 parameter for the Adam optimizer.
        beta2 (float): Beta2 parameter for the Adam optimizer.
        steps (int | Literal["chinchilla_optimal"]): Number of training steps
            or "chinchilla_optimal".
        eval_interval (int): Interval between evaluations.
        batch_size (int): Batch size for training.
        gradient_accumulation_steps (int): Number of steps for gradient
            accumulation.
        device_idx (int): Index of the device to use for training.
        mlflow_tracking_uri (str): URI for the MLflow tracking server.
        mlflow_experiment_name (str): Name of the MLflow experiment.
    """

    embedding_dim: int
    context_window: int
    vocab_size: int
    n_heads: int
    n_layers: int
    expansion_ratio: float
    activation_fn: str
    norm_fn: str
    input_dropout_prob: float
    residual_dropout_prob: float
    linear_dropout_prob: float
    max_learning_rate: float
    min_learning_rate: float
    warmup_steps: int
    weight_decay: float
    momentum: float
    beta1: float
    beta2: float
    steps: int | Literal["chinchilla_optimal"]
    eval_interval: int
    batch_size: int
    gradient_accumulation_steps: int
    device_idx: int
    mlflow_tracking_uri: str
    mlflow_experiment_name: str

    def dict(self) -> dict[str, int | float | str | bool]:
        """Convert the configuration to a dictionary.

        Returns:
            dict[str, int | float | str | bool]: A dictionary representation
                of the configuration.
        """
        out = {}
        for k, v in self.__class__.__dict__.items():
            if k.startswith("__"):
                continue
            if callable(v):
                continue
            out[k] = v
        return out
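

# Hedged usage sketch for ``dict()``: it collects the concrete class-level
# values of a config subclass (hypothetical interactive session, using
# DebugConfig defined below):
#
#   >>> DebugConfig().dict()["embedding_dim"]
#   14
#   >>> "dict" in DebugConfig().dict()  # dunders and callables are skipped
#   False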


class DebugConfig(GPTConfig):
    """Configuration for debugging purposes.

    This class inherits from GPTConfig and sets specific values for debugging.
    """

    embedding_dim = 14
    context_window = 13
    vocab_size = 11
    n_heads = 2
    n_layers = 1
    expansion_ratio = 4
    activation_fn = "gelu"
    norm_fn = "layer_norm"
    input_dropout_prob = 0.2
    residual_dropout_prob = 0.2
    linear_dropout_prob = 0.2
    max_learning_rate = 1e-3
    min_learning_rate = 1e-4
    warmup_steps = 100
    weight_decay = 1e-1
    momentum = 0
    beta1 = 0.9
    beta2 = 0.99
    steps = 250
    eval_interval = 1
    eval_steps = 1
    batch_size = 5
    gradient_accumulation_steps = 1
    sample_size = 4
    device_idx = 0
    mlflow_enabled = False
    mlflow_tracking_uri = ""
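

# This module only stores schedule hyperparameters; the schedule itself lives
# in the training code. A minimal sketch of how (warmup_steps,
# max_learning_rate, min_learning_rate) are typically consumed, assuming
# linear warmup followed by cosine decay. ``lr_at_step`` is a hypothetical
# helper, not part of this codebase.
import math


def lr_at_step(config: GPTConfig, step: int, total_steps: int) -> float:
    """Linear warmup to max_learning_rate, then cosine decay to min_learning_rate."""
    if step < config.warmup_steps:
        # Ramp linearly from near zero up to the maximum over the warmup period.
        return config.max_learning_rate * (step + 1) / config.warmup_steps
    # Cosine-anneal from max_learning_rate down to min_learning_rate.
    progress = (step - config.warmup_steps) / max(1, total_steps - config.warmup_steps)
    return config.min_learning_rate + 0.5 * (
        config.max_learning_rate - config.min_learning_rate
    ) * (1 + math.cos(math.pi * progress))


# With DebugConfig: lr_at_step(DebugConfig(), 99, 250) == 1e-3 (end of warmup),
# decaying to 1e-4 by step 250.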


class ShakespeareConfig(GPTConfig):
    """Configuration for Shakespeare-based models.

    This class inherits from GPTConfig and sets specific values for
    Shakespeare-based language models.
    """

    embedding_dim = 384
    context_window = 256
    vocab_size = 1024
    n_heads = 6
    n_layers = 6
    expansion_ratio = 4
    activation_fn = "gelu"
    norm_fn = "layer_norm"
    input_dropout_prob = 0.2
    residual_dropout_prob = 0.2
    linear_dropout_prob = 0.2
    max_learning_rate = 1e-2
    min_learning_rate = 1e-4
    warmup_steps = 100
    weight_decay = 1e-1
    momentum = 0
    beta1 = 0.9
    beta2 = 0.99
    steps = 5000
    eval_interval = 250
    eval_steps = 128
    batch_size = 128
    gradient_accumulation_steps = 1
    sample_size = 512
    device_idx = 1
    mlflow_enabled = True
    mlflow_tracking_uri = "http://localhost:5000"
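

# Hedged sketch of how a training script might consume the mlflow_* fields
# (kept as a comment so this module stays dependency-free). The experiment
# name "shakespeare" is an illustrative assumption; only ``mlflow_enabled``
# and ``mlflow_tracking_uri`` are set on this config.
#
#   import mlflow
#
#   config = ShakespeareConfig()
#   if config.mlflow_enabled:
#       mlflow.set_tracking_uri(config.mlflow_tracking_uri)
#       mlflow.set_experiment("shakespeare")
#       mlflow.log_params(config.dict())  # logs every hyperparameter above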


class SmolGPTConfig(GPTConfig):
    """Configuration for a small GPT model.

    This class inherits from GPTConfig and sets specific values for a
    small-scale GPT model.
    """

    embedding_dim = 768
    context_window = 1024
    vocab_size = 50256
    n_heads = 12
    n_layers = 12
    expansion_ratio = 4
    activation_fn = "gelu"
    norm_fn = "layer_norm"
    input_dropout_prob = 0
    residual_dropout_prob = 0
    linear_dropout_prob = 0
    max_learning_rate = 6e-4
    min_learning_rate = 0
    warmup_steps = 150  # roughly matches andrej's warmup steps in llm.c
    weight_decay = 1e-1
    momentum = 0
    beta1 = 0.9
    beta2 = 0.95
    steps = "chinchilla_optimal"
    eval_interval = 100
    eval_steps = 128
    batch_size = 4
    gradient_accumulation_steps = 128  # effective batch size of 524288 tokens
    n_tokens_to_generate = 512
    device_idx = 0
    mlflow_enabled = True
    mlflow_tracking_uri = "http://localhost:5000"
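

# Worked numbers for SmolGPTConfig, guarded so the module stays import-safe.
# Tokens per optimizer step = batch_size * gradient_accumulation_steps
# * context_window = 4 * 128 * 1024 = 524288, matching the "effective batch
# size" comment above. steps="chinchilla_optimal" is presumably resolved by
# the training code from a compute-optimal token budget (roughly 20 tokens
# per parameter, per the Chinchilla paper); that logic does not live here.
if __name__ == "__main__":
    config = SmolGPTConfig()
    tokens_per_step = (
        config.batch_size * config.gradient_accumulation_steps * config.context_window
    )
    print(f"tokens per optimizer step: {tokens_per_step:,}")  # 524,288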