Source code for tricycle.dataset

import random
from typing import Sequence

import numpy as np

from tricycle.tensor import Tensor


class Dataset:
    """
    An in-memory dataset: not suitable for large datasets.

    Pairs a sequence of inputs with a same-length sequence of outputs and
    supports indexing, single-pass iteration and shuffling. Shuffling
    permutes an internal index list; the stored data is never reordered.

    Attributes:
        inputs: A sequence of input data.
        outputs: A sequence of output data corresponding to the inputs.
        _indices: Maps a public index to a position in inputs/outputs.
        _index: The current cursor used by iteration.

    Args:
        inputs: A sequence of input data.
        outputs: A sequence of output data.

    Raises:
        AssertionError: If the length of inputs and outputs are not equal.
    """

    def __init__(self, inputs: Sequence, outputs: Sequence):
        if len(inputs) != len(outputs):
            # Raised explicitly instead of via `assert` so the check is not
            # stripped when Python runs with -O; the exception type callers
            # may already catch is unchanged.
            raise AssertionError(
                "inputs and outputs must have the same length: "
                f"{len(inputs)} != {len(outputs)}"
            )
        self.inputs = inputs
        self.outputs = outputs
        self._indices = list(range(len(inputs)))
        self._index = 0

    def __iter__(self):
        """
        Returns the dataset object as an iterator.

        The iteration cursor is NOT rewound here; call reset() before
        iterating a second time.
        """
        return self

    def __next__(self):
        """
        Returns the next item in the dataset.

        Raises:
            StopIteration: If all items have been iterated over.
        """
        if self._index >= len(self.inputs):
            raise StopIteration
        item = self[self._index]
        self._index += 1
        return item

    def __len__(self):
        """Returns the number of items in the dataset."""
        return len(self.inputs)

    def __getitem__(self, idx: int):
        """
        Returns the item at the specified index.

        Args:
            idx: The index of the item to retrieve.

        Returns:
            A tuple containing the input and output at the specified index.
        """
        # Go through the (possibly shuffled) index table first.
        position = self._indices[idx]
        return self.inputs[position], self.outputs[position]

    def shuffle(self):
        """
        Shuffles the dataset indices in place.

        Returns:
            The dataset object with shuffled indices.
        """
        np.random.shuffle(self._indices)
        return self

    def to_tensor(self):
        """
        Converts inputs and outputs to Tensor objects.

        Each element is wrapped individually, so inputs and outputs become
        lists of Tensors.

        Returns:
            The dataset object with inputs and outputs as Tensors.
        """
        self.inputs = [Tensor(x) for x in self.inputs]
        self.outputs = [Tensor(x) for x in self.outputs]
        return self

    def reset(self):
        """
        Resets the iteration index to 0.

        Returns:
            The dataset object with reset index.
        """
        self._index = 0
        return self

    @staticmethod
    def _shallow_copy(seq):
        """Copy `seq` via its own copy() when it has one, else into a list."""
        copier = getattr(seq, "copy", None)
        if callable(copier):
            return copier()
        # Tuples, ranges and similar sequences have no copy() method.
        return list(seq)

    def copy(self):
        """
        Creates a shallow copy of the dataset.

        The copy is always a plain Dataset and starts with the default
        (unshuffled) index order and a rewound iteration cursor.

        Returns:
            A new Dataset object with copied inputs and outputs.
        """
        return Dataset(
            self._shallow_copy(self.inputs),
            self._shallow_copy(self.outputs),
        )
class InfiniteBatchDataset(Dataset):
    """
    An infinite batch dataset that generates random batches.

    This class extends the Dataset class to provide infinite batches of
    data. Each batch is built by sampling items (with replacement) from the
    dataset, using the batch index as the random seed so that a given index
    always yields the same batch.

    Attributes:
        is_infinite: A boolean indicating if the dataset is infinite.
        _to_tensor: A boolean indicating if the data should be converted to
            tensors.
        is_batched: A boolean indicating if the data is batched.
        batch_size: The size of each batch.

    Args:
        inputs: A sequence of input data.
        outputs: A sequence of output data.
        batch_size: The size of each batch.
    """

    is_infinite = True
    _to_tensor = False
    is_batched = True

    def __init__(self, inputs: Sequence, outputs: Sequence, batch_size: int):
        super().__init__(inputs, outputs)
        self.batch_size = batch_size

    def __next__(self):
        """Returns the next batch of items; never raises StopIteration."""
        batch = self[self._index]
        self._index += 1
        return batch

    def __len__(self):
        """
        Returns -1 to indicate an infinite dataset.

        Note that the builtin len() rejects negative lengths with a
        ValueError, so the sentinel is only visible through a direct
        __len__() call.
        """
        return -1

    def __getitem__(self, idx: int):
        """
        Returns a randomly generated batch of items.

        Args:
            idx: The index used as a seed for random generation.

        Returns:
            A tuple containing batches of inputs and outputs.
        """
        # A private generator seeded with idx draws the same values as
        # seeding the module-level generator would, but leaves the global
        # `random` state untouched for the rest of the program.
        rng = random.Random(idx)
        last = len(self.inputs) - 1
        indices = [rng.randint(0, last) for _ in range(self.batch_size)]

        batch_inputs = np.vstack([self.inputs[i] for i in indices])
        batch_outputs = np.vstack([self.outputs[i] for i in indices])

        if self._to_tensor:
            # NOTE(review): inputs are created with the *outputs'* dtype.
            # Kept as-is; confirm this is intentional and not a copy-paste
            # slip.
            batch_inputs = Tensor(
                batch_inputs,
                is_batched=self.is_batched,
                dtype=batch_outputs.dtype,
            )
            batch_outputs = Tensor(
                batch_outputs,
                is_batched=self.is_batched,
                dtype=batch_outputs.dtype,
            )
        return batch_inputs, batch_outputs

    def to_tensor(self):
        """
        Sets the flag to convert data to tensors.

        Unlike the base class, nothing is converted eagerly; batches are
        wrapped in Tensors when they are generated.

        Returns:
            The dataset object with _to_tensor flag set to True.
        """
        self._to_tensor = True
        return self
class CausalLMDataset:
    """
    A dataset for causal language modeling tasks.

    This class provides functionality for creating batches of token
    sequences for training causal language models. Outputs are the inputs
    shifted one token to the right.

    Attributes:
        tokens: The input token sequence.
        vocab_size: The size of the vocabulary.
        batch_size: The size of each batch.
        context_window: The size of the context window.
        is_batch: A boolean indicating if the data is batched.
        as_tensor: A boolean indicating if the data should be returned as
            tensors.
        _idx: The current index for iteration.
        batch_indices: The window start positions used for batching.
        should_one_hot_encode: A boolean indicating if outputs should be
            one-hot encoded (stored only; not read inside this class).
        device: The device (GPU) to use for tensors.

    Args:
        tokens: The input token sequence.
        vocab_size: The size of the vocabulary.
        batch_size: The size of each batch.
        context_window: The size of the context window.
        should_one_hot_encode: Whether to one-hot encode the outputs.
    """

    def __init__(
        self,
        tokens: np.ndarray,
        vocab_size: int,
        batch_size: int,
        context_window: int,
        should_one_hot_encode: bool = False,
    ):
        self.tokens = tokens
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.context_window = context_window
        self.is_batch = False
        self.as_tensor = False
        self._idx = 0
        self.batch_indices = None
        self.should_one_hot_encode = should_one_hot_encode
        self.device = None

    def __len__(self):
        """
        Returns the length of the dataset based on batching configuration.

        The value is clamped at 0: a negative result would make the builtin
        len() (and therefore iteration) raise ValueError when the token
        sequence is too short.
        """
        if self.is_batch:
            usable = (
                len(self.tokens) - self.context_window - self.batch_size - 1
            )
            count = usable // self.batch_size
        else:
            # NOTE(review): unbatched __getitem__ strides by context_window,
            # so indices near the top of this range produce short or empty
            # slices - confirm the intended unbatched length.
            count = len(self.tokens) - 1
        return max(0, count)

    def __getitem__(self, idx: int):
        """
        Returns a batch or single item from the dataset.

        Args:
            idx: The index of the item or batch to retrieve.

        Returns:
            A tuple containing input and output sequences.
        """
        if self.is_batch:
            first = idx * self.batch_size
            last = (idx + 1) * self.batch_size
            starts = self.batch_indices[first:last]
            # One extra token per window so inputs and outputs can both be
            # context_window long after the one-token shift.
            windows = [
                self.tokens[s : s + self.context_window + 1] for s in starts
            ]
            inputs = np.vstack([w[:-1] for w in windows])
            outputs = np.vstack([w[1:] for w in windows])
        else:
            first = idx * self.context_window
            last = (idx + 1) * self.context_window + 1
            window = self.tokens[first:last]
            inputs = window[:-1]
            outputs = window[1:]

        if self.as_tensor:
            inputs = Tensor(
                inputs,
                requires_grad=False,
                name="inputs",
                is_batched=self.is_batch,
                dtype=outputs.dtype,
            )
            outputs = Tensor(
                outputs,
                requires_grad=False,
                name="output",
                is_batched=self.is_batch,
                dtype=outputs.dtype,
            )
            # Device transfer is done only on the Tensor objects built
            # above; raw (non-tensor) results are returned untouched.
            if self.device is not None:
                inputs.to_gpu(self.device)
                outputs.to_gpu(self.device)
        return inputs, outputs

    def __iter__(self):
        """Rewinds the cursor and returns the dataset object as an iterator."""
        self._idx = 0
        return self

    def __next__(self):
        """
        Returns the next item or batch in the dataset.

        Raises:
            StopIteration: If all items have been iterated over.
        """
        if self._idx >= len(self):
            raise StopIteration
        item = self[self._idx]
        self._idx += 1
        return item

    def batch(self):
        """
        Configures the dataset for batch processing.

        Resets batch_indices to the ordered window start positions,
        discarding any earlier shuffle.

        Returns:
            The dataset object configured for batch processing.
        """
        print("batching")
        self.is_batch = True
        self.batch_indices = np.arange(
            len(self.tokens) - self.context_window - 1
        )
        return self

    def unbatch(self):
        """
        Configures the dataset for non-batch processing.

        Returns:
            The dataset object configured for non-batch processing.
        """
        self.is_batch = False
        return self

    def shuffle(self):
        """
        Shuffles the batch indices.

        Returns:
            The dataset object with shuffled batch indices.

        Raises:
            NotImplementedError: If the dataset is currently unbatched but
                already has batch indices (i.e. it was batched and then
                unbatched).
        """
        print("shuffling")
        # NOTE(review): a dataset that has never been batched passes this
        # check, but a later batch() call overwrites the permutation, so
        # shuffle-before-batch has no lasting effect. Confirm whether the
        # guard was meant to be simply `not self.is_batch`.
        if not self.is_batch and self.batch_indices is not None:
            raise NotImplementedError(
                "Shuffling non-batched datasets is not currently supported"
            )
        n_windows = len(self.tokens) - self.context_window - 1
        # Sampling every start position without replacement yields a random
        # permutation of the windows.
        self.batch_indices = np.random.choice(
            n_windows, size=n_windows, replace=False
        )
        return self

    def to_gpu(self, device: int = 0):
        """
        Sets the device for GPU processing.

        Args:
            device: The GPU device number.

        Returns:
            The dataset object configured for GPU processing.
        """
        self.device = device
        return self

    def from_gpu(self):
        """
        Resets the device to CPU processing.

        Returns:
            The dataset object configured for CPU processing.
        """
        self.device = None
        return self

    def to_tensor(self):
        """
        Configures the dataset to return tensors.

        Returns:
            The dataset object configured to return tensors.
        """
        print("converting to tensor")
        self.as_tensor = True
        return self