Source code for tricycle_datasets.fineweb

"""Prepares and manages web text data from Fineweb.

This module provides functionality to download, tokenize, and manage the
fineweb dataset. It includes utilities for data preparation and a custom
dataset class for efficient data loading.

Typical usage example:

  dataset = FineWeb(vocab_size=50256, split='train')
  tokens = dataset[0:1000]  # Get the first 1000 tokens
"""

import os
from collections import abc
from pathlib import Path
from typing import Literal

import numpy as np
import tiktoken
from tqdm.auto import tqdm

import datasets
from datasets import load_dataset

N_CORES = os.cpu_count()
SAVE_DIR = Path("datasets/fineweb")
SAVE_DIR.mkdir(exist_ok=True, parents=True)
DTYPE = np.uint16


tokeniser = tiktoken.get_encoding("gpt2")


def tokenise_document(example):
    """Tokenizes a single document from the dataset.

    Args:
        example: A dictionary containing the 'text' field to be tokenized.

    Returns:
        A dictionary with 'ids' (tokenized text) and 'len' (number of tokens).
    """
    # encode_ordinary ignores any special tokens
    ids = tokeniser.encode_ordinary(example["text"])
    # add the end of text token
    ids.append(tokeniser.eot_token)
    out = {"ids": ids, "len": len(ids)}
    return out

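# Illustrative usage (not part of the original module): tokenise_document
# expects a mapping with a "text" field and returns GPT-2 token ids plus their
# count. The sample text below is made up.
#
#   sample = {"text": "The quick brown fox jumps over the lazy dog."}
#   out = tokenise_document(sample)
#   assert out["len"] == len(out["ids"])
#   assert out["ids"][-1] == tokeniser.eot_token  # each doc ends with <|endoftext|>
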
def prepare_data():
    """Downloads and tokenizes the fineweb dataset.

    This function is adapted from Andrej Karpathy's NanoGPT:
    https://github.com/karpathy/nanoGPT/blob/master/data/openwebtext/prepare.py

    The function performs the following steps:
    1. Loads the dataset
    2. Splits it into train and validation sets
    3. Tokenizes the dataset
    4. Saves the tokenized data to binary files

    Note:
        This function uses OpenAI's tiktoken for tokenization due to
        performance considerations.
    """
    datasets.disable_caching()
    dataset = load_dataset(
        "HuggingFaceFW/fineweb", name="sample-10BT", split="train"
    )

    split_dataset = dataset.train_test_split(
        test_size=0.0005, seed=2357, shuffle=True
    )
    # rename the test split to valid
    split_dataset["valid"] = split_dataset.pop("test")

    # tokenise the dataset
    tokenised = split_dataset.map(
        tokenise_document,
        remove_columns=["text"],
        desc="Tokenising",
        num_proc=N_CORES,
    )

    # concatenate all the ids in each dataset into one large file we can use
    # for training
    for split, dset in tokenised.items():
        filename = SAVE_DIR / f"{split}.bin"
        n_tokens = np.sum(dset["len"])
        print(f"Found: {n_tokens} {split} tokens")

        arr = np.memmap(filename, dtype=DTYPE, mode="w+", shape=(n_tokens,))
        total_batches = 1024

        idx = 0
        for batch_idx in tqdm(
            range(total_batches), desc=f"writing {filename}"
        ):
            # Batch together samples for faster write
            batch = dset.shard(
                num_shards=total_batches, index=batch_idx, contiguous=True
            ).with_format("numpy")
            arr_batch = np.concatenate(batch["ids"])

            # Write into mmap
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        arr.flush()

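# Illustrative note (not part of the original module): prepare_data writes one
# flat uint16 token stream per split under datasets/fineweb. A minimal sketch
# of reading a split back, assuming prepare_data() has already been run:
#
#   tokens = np.memmap(SAVE_DIR / "valid.bin", dtype=DTYPE, mode="r")
#   print(tokens.shape, tokens[:10])
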
class FineWeb(abc.Sequence):
    """A custom dataset class for efficient loading of tokenized fineweb data.

    This class provides an interface to access tokenized fineweb data,
    supporting indexing and length operations. It also includes methods for
    encoding and decoding tokens.

    Attributes:
        vocab_size: An integer representing the vocabulary size.
        token_path: A Path object pointing to the tokenized data file.
        tokeniser_string: A string specifying the tokenizer to use
            (default: "gpt2").
        tokens: A numpy memmap of the tokenized data.

    Args:
        vocab_size: An integer specifying the vocabulary size.
        split: A string literal, either "train" or "valid", specifying the
            dataset split.
        token_path: An optional Path object for the tokenized data file.

    Raises:
        ValueError: If the tokenizer's max token value doesn't match the
            specified vocab size.
    """

    vocab_size: int
    token_path: Path
    tokeniser_string: str = "gpt2"
    tokens: np.ndarray

    def __init__(
        self,
        vocab_size: int,
        split: Literal["train"] | Literal["valid"],
        token_path: Path | None = None,
    ):
        self.vocab_size = vocab_size
        self.tokeniser = tiktoken.get_encoding(self.tokeniser_string)
        if self.tokeniser.max_token_value != vocab_size:
            raise ValueError(
                "Expected tokeniser.max_token_value == vocab_size. Found "
                f"{self.tokeniser.max_token_value=}, {vocab_size=}"
            )

        if token_path is None:
            self.token_path = SAVE_DIR / f"{split}.bin"
        else:
            self.token_path = token_path

        if not self.token_path.exists():
            prepare_data()
        assert self.token_path.exists()

        self.tokens = np.memmap(self.token_path, dtype=DTYPE, mode="r")

    def __getitem__(self, key):
        """Retrieves token(s) at the specified index or slice.

        Args:
            key: An integer index or slice object.

        Returns:
            The token(s) at the specified index or slice.
        """
        return self.tokens[key]

    def __len__(self):
        """Returns the total number of tokens in the dataset.

        Returns:
            An integer representing the number of tokens.
        """
        return len(self.tokens)

    def encode(self, *args):
        """Encodes the input text into tokens.

        Args:
            *args: Variable length argument list to be passed to the
                tokenizer.

        Returns:
            A list of integer token IDs.
        """
        return self.tokeniser.encode_ordinary(*args)

    def decode(self, *args):
        """Decodes the input tokens into text.

        Args:
            *args: Variable length argument list to be passed to the
                tokenizer.

        Returns:
            A string of decoded text.
        """
        return self.tokeniser.decode(*args)
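
# Illustrative usage (not part of the original module): load the validation
# split and decode a small window of tokens. Note that __init__ compares
# vocab_size against tiktoken's GPT-2 max_token_value, which is 50256.
#
#   ds = FineWeb(vocab_size=50256, split="valid")
#   window = [int(t) for t in ds[:32]]  # memmap slice -> plain ints
#   print(len(ds), ds.decode(window))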