class Dataset:
    """
    An in-memory dataset: not suitable for large datasets.

    This class represents a basic dataset with inputs and corresponding
    outputs. It supports iteration, indexing, and shuffling of data.

    Attributes:
        inputs: A sequence of input data.
        outputs: A sequence of output data corresponding to the inputs.
        _indices: A list of indices for accessing data.
        _index: The current index for iteration.

    Args:
        inputs: A sequence of input data.
        outputs: A sequence of output data.

    Raises:
        AssertionError: If the lengths of inputs and outputs are not equal.
    """

    def __init__(self, inputs: Sequence, outputs: Sequence):
        assert len(inputs) == len(outputs)

        self.inputs = inputs
        self.outputs = outputs
        self._indices = list(range(len(inputs)))
        self._index = 0

    def __iter__(self):
        """Returns the dataset object as an iterator."""
        return self

    def __next__(self):
        """
        Returns the next item in the dataset.

        Raises:
            StopIteration: If all items have been iterated over.
        """
        if self._index >= len(self.inputs):
            raise StopIteration
        result = self[self._index]
        self._index += 1
        return result

    def __len__(self):
        """Returns the number of items in the dataset."""
        return len(self.inputs)

    def __getitem__(self, idx: int):
        """
        Returns the item at the specified index.

        Args:
            idx: The index of the item to retrieve.

        Returns:
            A tuple containing the input and output at the specified index.
        """
        idx = self._indices[idx]
        return self.inputs[idx], self.outputs[idx]
    def shuffle(self):
        """
        Shuffles the dataset indices.

        Returns:
            The dataset object with shuffled indices.
        """
        np.random.shuffle(self._indices)
        return self
    def to_tensor(self):
        """
        Converts inputs and outputs to Tensor objects.

        Returns:
            The dataset object with inputs and outputs as Tensors.
        """
        self.inputs = [Tensor(x) for x in self.inputs]
        self.outputs = [Tensor(x) for x in self.outputs]
        return self
    def reset(self):
        """
        Resets the iteration index to 0.

        Returns:
            The dataset object with reset index.
        """
        self._index = 0
        return self
    def copy(self):
        """
        Creates a shallow copy of the dataset.

        Returns:
            A new Dataset object with copied inputs and outputs.
        """
        return Dataset(self.inputs.copy(), self.outputs.copy())
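
# Illustrative usage sketch (not part of the original module): builds a tiny
# Dataset from numpy arrays, converts it to Tensors, shuffles, and iterates.
# `np` and `Tensor` are assumed to be the same names imported at the top of
# this module; the helper function name below is made up for demonstration.
def _example_dataset_usage():
    inputs = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
    outputs = [np.array([0.0]), np.array([1.0])]
    dataset = Dataset(inputs, outputs).to_tensor().shuffle()
    for x, y in dataset:
        pass  # x and y are Tensor objects
    dataset.reset()  # rewind the iterator before another pass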
class InfiniteBatchDataset(Dataset):
    """
    An infinite batch dataset that generates random batches.

    This class extends the Dataset class to provide infinite batches of data.
    It randomly selects items from the dataset to form batches.

    Attributes:
        is_infinite: A boolean indicating if the dataset is infinite.
        _to_tensor: A boolean indicating if the data should be converted to
            tensors.
        is_batched: A boolean indicating if the data is batched.
        batch_size: The size of each batch.

    Args:
        inputs: A sequence of input data.
        outputs: A sequence of output data.
        batch_size: The size of each batch.
    """

    is_infinite = True
    _to_tensor = False
    is_batched = True

    def __init__(self, inputs: Sequence, outputs: Sequence, batch_size: int):
        super().__init__(inputs, outputs)
        self.batch_size = batch_size

    def __next__(self):
        """Returns the next batch of items."""
        result = self[self._index]
        self._index += 1
        return result

    def __len__(self):
        """Returns -1 to indicate an infinite dataset."""
        return -1

    def __getitem__(self, idx: int):
        """
        Returns a randomly generated batch of items.

        Args:
            idx: The index used as a seed for random generation.

        Returns:
            A tuple containing batches of inputs and outputs.
        """
        random.seed(idx)
        indices = [
            random.randint(0, len(self.inputs) - 1)
            for _ in range(self.batch_size)
        ]
        batch_inputs = np.vstack([self.inputs[i] for i in indices])
        batch_outputs = np.vstack([self.outputs[i] for i in indices])
        if self._to_tensor:
            batch_inputs = Tensor(
                batch_inputs,
                is_batched=self.is_batched,
                dtype=batch_outputs.dtype,
            )
            batch_outputs = Tensor(
                batch_outputs,
                is_batched=self.is_batched,
                dtype=batch_outputs.dtype,
            )
        return batch_inputs, batch_outputs
    def to_tensor(self):
        """
        Sets the flag to convert data to tensors.

        Returns:
            The dataset object with _to_tensor flag set to True.
        """
        self._to_tensor = True
        return self
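
# Illustrative usage sketch (not part of the original module): an
# InfiniteBatchDataset never raises StopIteration, so the caller decides how
# many batches to draw. The index passed to __getitem__ seeds the random
# sampling, so batch i is reproducible. Names assume the module's own imports;
# the helper function name is made up for demonstration.
def _example_infinite_batch_usage():
    inputs = [np.array([i, i + 1]) for i in range(10)]
    outputs = [np.array([i]) for i in range(10)]
    dataset = InfiniteBatchDataset(inputs, outputs, batch_size=4).to_tensor()
    batches = [next(dataset) for _ in range(3)]
    # each element is a (batch_inputs, batch_outputs) pair of Tensors,
    # with batch_inputs stacked to shape (4, 2)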
class CausalLMDataset:
    """
    A dataset for causal language modeling tasks.

    This class provides functionality for creating batches of token sequences
    for training causal language models.

    Attributes:
        tokens: The input token sequence.
        vocab_size: The size of the vocabulary.
        batch_size: The size of each batch.
        context_window: The size of the context window.
        is_batch: A boolean indicating if the data is batched.
        as_tensor: A boolean indicating if the data should be returned as
            tensors.
        _idx: The current index for iteration.
        batch_indices: The indices used for batching.
        should_one_hot_encode: A boolean indicating if outputs should be
            one-hot encoded.
        device: The device (GPU) to use for tensors.

    Args:
        tokens: The input token sequence.
        vocab_size: The size of the vocabulary.
        batch_size: The size of each batch.
        context_window: The size of the context window.
        should_one_hot_encode: Whether to one-hot encode the outputs.
    """

    def __init__(
        self,
        tokens: np.ndarray,
        vocab_size: int,
        batch_size: int,
        context_window: int,
        should_one_hot_encode: bool = False,
    ):
        self.tokens = tokens
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.context_window = context_window
        self.is_batch = False
        self.as_tensor = False
        self._idx = 0
        self.batch_indices = None
        self.should_one_hot_encode = should_one_hot_encode
        self.device = None

    def __len__(self):
        """Returns the length of the dataset based on batching configuration."""
        return (
            (len(self.tokens) - self.context_window - self.batch_size - 1)
            // self.batch_size
            if self.is_batch
            else len(self.tokens) - 1
        )

    def __getitem__(self, idx: int):
        """
        Returns a batch or single item from the dataset.

        Args:
            idx: The index of the item or batch to retrieve.

        Returns:
            A tuple containing input and output sequences.
        """
        if self.is_batch:
            start = idx * self.batch_size
            end = (idx + 1) * self.batch_size
            indices = self.batch_indices[start:end]
            batches = [
                self.tokens[i : i + self.context_window + 1] for i in indices
            ]
            inputs = np.vstack([b[:-1] for b in batches])
            outputs = np.vstack([b[1:] for b in batches])
        else:
            start = idx * self.context_window
            end = (idx + 1) * self.context_window + 1
            tokens = self.tokens[start:end]
            inputs = tokens[:-1]
            outputs = tokens[1:]

        if self.as_tensor:
            inputs = Tensor(
                inputs,
                requires_grad=False,
                name="inputs",
                is_batched=self.is_batch,
                dtype=outputs.dtype,
            )
            outputs = Tensor(
                outputs,
                requires_grad=False,
                name="output",
                is_batched=self.is_batch,
                dtype=outputs.dtype,
            )
            if self.device is not None:
                inputs.to_gpu(self.device)
                outputs.to_gpu(self.device)
        return inputs, outputs

    def __iter__(self):
        """Returns the dataset object as an iterator."""
        self._idx = 0
        return self

    def __next__(self):
        """
        Returns the next item or batch in the dataset.

        Raises:
            StopIteration: If all items have been iterated over.
        """
        if self._idx >= len(self):
            raise StopIteration
        result = self[self._idx]
        self._idx += 1
        return result
    def batch(self):
        """
        Configures the dataset for batch processing.

        Returns:
            The dataset object configured for batch processing.
        """
        print("batching")
        self.is_batch = True
        self.batch_indices = np.arange(
            len(self.tokens) - self.context_window - 1
        )
        return self
    def unbatch(self):
        """
        Configures the dataset for non-batch processing.

        Returns:
            The dataset object configured for non-batch processing.
        """
        self.is_batch = False
        return self
    def shuffle(self):
        """
        Shuffles the batch indices.

        Returns:
            The dataset object with shuffled batch indices.

        Raises:
            NotImplementedError: If trying to shuffle a non-batched dataset.
        """
        print("shuffling")
        if not self.is_batch and self.batch_indices is not None:
            raise NotImplementedError(
                "Shuffling non-batched datasets is not currently supported"
            )
        else:
            n_batches = len(self.tokens) - self.context_window - 1
            self.batch_indices = np.random.choice(
                n_batches, size=n_batches, replace=False
            )
        return self
    def to_gpu(self, device: int = 0):
        """
        Sets the device for GPU processing.

        Args:
            device: The GPU device number.

        Returns:
            The dataset object configured for GPU processing.
        """
        self.device = device
        return self
    def from_gpu(self):
        """
        Resets the device to CPU processing.

        Returns:
            The dataset object configured for CPU processing.
        """
        self.device = None
        return self
    def to_tensor(self):
        """
        Configures the dataset to return tensors.

        Returns:
            The dataset object configured to return tensors.
        """
        print("converting to tensor")
        self.as_tensor = True
        return self
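
# Illustrative usage sketch (not part of the original module): a typical
# training setup chains batch(), shuffle() and to_tensor() so iteration
# yields (inputs, outputs) Tensor pairs where outputs is inputs shifted by
# one token, as required for next-token prediction. The token array and the
# helper function name below are made up for demonstration; `np` and `Tensor`
# are assumed to be the module's own imports.
def _example_causal_lm_usage():
    tokens = np.arange(1000) % 64  # fake token ids with a vocabulary of 64
    dataset = (
        CausalLMDataset(
            tokens,
            vocab_size=64,
            batch_size=8,
            context_window=32,
        )
        .batch()
        .shuffle()
        .to_tensor()
    )
    inputs, outputs = next(iter(dataset))
    # inputs and outputs both have shape (8, 32); call .to_gpu() on the
    # dataset first if the batches should be moved to a GPU device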