# Andromeda/build_dataset.py
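"""Tokenize a Hugging Face dataset, pack it into fixed-length blocks, and
optionally upload the result to the Hugging Face Hub.

The script streams the source dataset, appends an EOS token to every
document, tokenizes the text, and groups the token ids into blocks of
``seq_len`` tokens.
"""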
import multiprocessing
from itertools import chain
from datasets import load_dataset
from transformers import AutoTokenizer
from huggingface_hub import HfApi


class DatasetBuilder:
    """Builds a tokenized, block-packed version of a text dataset."""

    def __init__(
        self,
        dataset_name,
        seq_len=8192,
        num_cpu=None,
        hf_account_repo=None,
        tokenizer="EleutherAI/gpt-neox-20b",
    ):
        self.dataset_name = dataset_name
        self.seq_len = seq_len
        self.num_cpu = num_cpu or multiprocessing.cpu_count()
        self.hf_account_repo = hf_account_repo
        self.tokenizer = tokenizer

    def build_dataset(self):
        """Tokenize and chunk the dataset, optionally pushing it to the Hub."""
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
        train_dataset = load_dataset(self.dataset_name, split="train", streaming=True)
        dataset = train_dataset.shuffle()

        def tokenize_function(example):
            # Append the EOS token to every document before tokenizing.
            return tokenizer([t + tokenizer.eos_token for t in example["text"]])

        # num_proc is not supported when mapping over a streaming
        # (IterableDataset) dataset, so this map runs in a single process.
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"],
        )

        block_size = self.seq_len

        def group_texts(examples):
            # Concatenate all token sequences in the batch, then split the
            # result into blocks of exactly `block_size` tokens, dropping the
            # trailing remainder that does not fill a full block.
            concatenated_examples = {
                k: list(chain(*examples[k])) for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            if total_length >= block_size:
                total_length = (total_length // block_size) * block_size
            result = {
                k: [
                    t[i : i + block_size]
                    for i in range(0, total_length, block_size)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        train_tokenized_dataset = tokenized_dataset.map(
            group_texts,
            batched=True,
        )

        if self.hf_account_repo:
            # TODO: passing the dataset object directly raises
            # "ValueError: path_or_fileobj must be either an instance of str,
            # bytes or io.BufferedIOBase." As a workaround, the upload points
            # at a local file; "TOKENIZED_DATASET" is a placeholder path to a
            # dataset saved on disk.
            # train_tokenized_dataset.push_to_hub(self.hf_account_repo, private=True)
            hf_api = HfApi()
            hf_api.upload_file(
                path_or_fileobj="TOKENIZED_DATASET",  # placeholder local path
                path_in_repo="README.md",
                repo_id=self.hf_account_repo,
                repo_type="dataset",
            )

        return train_tokenized_dataset


builder = DatasetBuilder(
dataset_name="the_pile_books3",
seq_len=8192,
# num_cpu=4,
hf_account_repo="kye/thepilebooks3-gptneox-8k",
tokenizer="EleutherAI/gpt-neox-20b",
)
dataset = builder.build_dataset()
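
# Optional sanity check (a minimal sketch, not part of the original script):
# the streaming dataset should yield fixed-length blocks of `seq_len` token ids.
sample = next(iter(dataset))
assert len(sample["input_ids"]) == builder.seq_len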