"""Prepares and manages web text data from Fineweb.This module provides functionality to download, tokenize, and manage thefineweb dataset. It includes utilities for data preparation and a customdataset class for efficient data loading.Typical usage example: dataset = FineWeb(vocab_size=50257, split='train') tokens = dataset[0:1000] # Get the first 1000 tokens"""importosfromcollectionsimportabcfrompathlibimportPathfromtypingimportLiteralimportnumpyasnpimporttiktokenfromtqdm.autoimporttqdmimportdatasetsfromdatasetsimportload_datasetN_CORES=os.cpu_count()SAVE_DIR=Path("datasets/fineweb")SAVE_DIR.mkdir(exist_ok=True,parents=True)DTYPE=np.uint16tokeniser=tiktoken.get_encoding("gpt2")


def tokenise_document(example):
    """Tokenizes a single document from the dataset.

    Args:
        example: A dictionary containing the 'text' field to be tokenized.

    Returns:
        A dictionary with 'ids' (tokenized text) and 'len' (number of tokens).
    """
    # encode_ordinary ignores any special tokens
    ids = tokeniser.encode_ordinary(example["text"])
    # add the end of text token
    ids.append(tokeniser.eot_token)
    out = {"ids": ids, "len": len(ids)}
    return out
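

# A minimal sketch, not part of the original module: it shows the structure
# ``tokenise_document`` returns for a toy record. The example text is
# arbitrary; run this file directly to execute it.
if __name__ == "__main__":
    _doc = tokenise_document({"text": "FineWeb is a large web-text corpus."})
    # 'ids' ends with the gpt2 end-of-text token, and 'len' counts it too.
    assert _doc["ids"][-1] == tokeniser.eot_token
    assert _doc["len"] == len(_doc["ids"])
    print(f"example document -> {_doc['len']} tokens")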


def prepare_data():
    """Downloads and tokenizes the fineweb dataset.

    This function is adapted from Andrej Karpathy's nanoGPT:
    https://github.com/karpathy/nanoGPT/blob/master/data/openwebtext/prepare.py

    The function performs the following steps:

    1. Loads the dataset
    2. Splits it into train and validation sets
    3. Tokenizes the dataset
    4. Saves the tokenized data to binary files

    Note:
        This function uses OpenAI's tiktoken for tokenization due to
        performance considerations.
    """
    datasets.disable_caching()
    dataset = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train")
    split_dataset = dataset.train_test_split(test_size=0.0005, seed=2357, shuffle=True)
    split_dataset["valid"] = split_dataset.pop("test")  # rename the test split to valid

    # tokenise the dataset
    tokenised = split_dataset.map(
        tokenise_document,
        remove_columns=["text"],
        desc="Tokenising",
        num_proc=N_CORES,
    )

    # concatenate all the ids in each dataset into one large file we can use
    # for training
    for split, dset in tokenised.items():
        filename = SAVE_DIR / f"{split}.bin"
        n_tokens = np.sum(dset["len"])
        print(f"Found: {n_tokens} {split} tokens")
        arr = np.memmap(filename, dtype=DTYPE, mode="w+", shape=(n_tokens,))
        total_batches = 1024
        idx = 0
        for batch_idx in tqdm(range(total_batches), desc=f"writing {filename}"):
            # Batch together samples for faster write
            batch = dset.shard(
                num_shards=total_batches, index=batch_idx, contiguous=True
            ).with_format("numpy")
            arr_batch = np.concatenate(batch["ids"])
            # Write into mmap
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        arr.flush()
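

# A hypothetical helper, not part of the original module: it reads a shard
# written by ``prepare_data`` back through ``np.memmap`` so the token count
# printed during preparation can be double-checked. The name ``inspect_shard``
# is an illustrative assumption.
def inspect_shard(split: str = "valid") -> int:
    """Returns the number of tokens stored in the ``{split}.bin`` shard."""
    # Each shard is a flat array of uint16 gpt2 token ids, so its length is
    # exactly the number of tokens written for that split.
    shard = np.memmap(SAVE_DIR / f"{split}.bin", dtype=DTYPE, mode="r")
    return len(shard)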


class FineWeb(abc.Sequence):
    """A custom dataset class for efficient loading of tokenized fineweb data.

    This class provides an interface to access tokenized fineweb data,
    supporting indexing and length operations. It also includes methods for
    encoding and decoding tokens.

    Attributes:
        vocab_size: An integer representing the vocabulary size.
        token_path: A Path object pointing to the tokenized data file.
        tokeniser_string: A string specifying the tokenizer to use
            (default: "gpt2").
        tokens: A numpy memmap of the tokenized data.

    Args:
        vocab_size: An integer specifying the vocabulary size.
        split: A string literal, either "train" or "valid", specifying the
            dataset split.
        token_path: An optional Path object for the tokenized data file.

    Raises:
        ValueError: If the tokenizer's max token value doesn't match the
            specified vocab size.
    """

    vocab_size: int
    token_path: Path
    tokeniser_string: str = "gpt2"
    tokens: np.ndarray

    def __init__(
        self,
        vocab_size: int,
        split: Literal["train"] | Literal["valid"],
        token_path: Path | None = None,
    ):
        self.vocab_size = vocab_size
        self.tokeniser = tiktoken.get_encoding(self.tokeniser_string)
        if self.tokeniser.max_token_value != vocab_size:
            raise ValueError(
                "Expected tokeniser.max_token_value == vocab_size. Found "
                f"{self.tokeniser.max_token_value=}, {vocab_size=}"
            )

        if token_path is None:
            self.token_path = SAVE_DIR / f"{split}.bin"
        else:
            self.token_path = token_path

        if not self.token_path.exists():
            prepare_data()
        assert self.token_path.exists()

        self.tokens = np.memmap(self.token_path, dtype=DTYPE, mode="r")

    def __getitem__(self, key):
        """Retrieves token(s) at the specified index or slice.

        Args:
            key: An integer index or slice object.

        Returns:
            The token(s) at the specified index or slice.
        """
        return self.tokens[key]

    def __len__(self):
        """Returns the total number of tokens in the dataset.

        Returns:
            An integer representing the number of tokens.
        """
        return len(self.tokens)

    def encode(self, *args):
        """Encodes the input text into tokens.

        Args:
            *args: Variable length argument list to be passed to the tokenizer.

        Returns:
            A list of integer token IDs.
        """
        return self.tokeniser.encode_ordinary(*args)

    def decode(self, *args):
        """Decodes the input tokens into text.

        Args:
            *args: Variable length argument list to be passed to the tokenizer.

        Returns:
            A string of decoded text.
        """
        return self.tokeniser.decode(*args)
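

# A minimal usage sketch, assuming the tokenised shards already exist under
# SAVE_DIR (constructing FineWeb without them triggers the full download in
# ``prepare_data``). ``vocab_size`` is taken from the module-level tokeniser
# so that the check in ``__init__`` passes.
if __name__ == "__main__" and (SAVE_DIR / "valid.bin").exists():
    fineweb = FineWeb(
        vocab_size=tokeniser.max_token_value,
        split="valid",
    )
    print(f"{len(fineweb):,} validation tokens")
    first_tokens = fineweb[:32]  # slicing reads directly from the memmap
    # Token ids come back as numpy uint16, so cast to int before decoding.
    print(fineweb.decode([int(t) for t in first_tokens]))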