import multiprocessing
from itertools import chain
from datasets import load_dataset
from transformers import AutoTokenizer
from huggingface_hub import HfApi


class DatasetBuilder:
    """Streams a Hugging Face dataset, tokenizes it, and packs the token ids
    into fixed-length blocks for language-model pretraining."""

    def __init__(
        self,
        dataset_name,
        seq_len=8192,
        num_cpu=None,
        hf_account_repo=None,
        tokenizer="EleutherAI/gpt-neox-20b",
    ):
        self.dataset_name = dataset_name
        self.seq_len = seq_len
        self.num_cpu = num_cpu or multiprocessing.cpu_count()
        self.hf_account_repo = hf_account_repo
        self.tokenizer = tokenizer

    def build_dataset(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
        train_dataset = load_dataset(self.dataset_name, split="train", streaming=True)
        dataset = train_dataset.shuffle()

        def tokenize_function(example):
            # Append the EOS token so document boundaries survive concatenation.
            return tokenizer([t + tokenizer.eos_token for t in example["text"]])

        # .map() on a streaming (Iterable) dataset runs in the main process;
        # num_proc is not supported here, so self.num_cpu goes unused on this path.
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"],
        )

        block_size = self.seq_len

        def group_texts(examples):
            # Concatenate every tokenized sequence in the batch, then split the
            # result into contiguous blocks of exactly block_size tokens,
            # dropping the remainder at the end of the batch.
            concatenated_examples = {
                k: list(chain(*examples[k])) for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])

            if total_length >= block_size:
                total_length = (total_length // block_size) * block_size

            result = {
                k: [
                    t[i : i + block_size]
                    for i in range(0, total_length, block_size)
                ]
                for k, t in concatenated_examples.items()
            }

            return result

        train_tokenized_dataset = tokenized_dataset.map(
            group_texts, batched=True
        )

        if self.hf_account_repo:
            # upload_file() expects a local path, bytes, or a binary file
            # object; passing the dataset object itself raises
            # "ValueError: path_or_fileobj must be either an instance of str,
            # bytes or io.BufferedIOBase". "TOKENIZED_DATASET" is therefore a
            # placeholder for the path of a file saved to local disk first.
            hf_api = HfApi()
            hf_api.upload_file(
                path_or_fileobj="TOKENIZED_DATASET",  # placeholder: local file path
                path_in_repo="README.md",
                repo_id=self.hf_account_repo,
                repo_type="dataset",
            )

        return train_tokenized_dataset


# Build an 8k-token-block version of Books3 tokenized with the GPT-NeoX-20B tokenizer.
builder = DatasetBuilder(
    dataset_name="the_pile_books3",
    seq_len=8192,
    hf_account_repo="kye/thepilebooks3-gptneox-8k",
    tokenizer="EleutherAI/gpt-neox-20b",
)

dataset = builder.build_dataset()
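

# --- Optional helpers (illustrative sketches) ---

def peek(n=3):
    # Minimal sanity check sketch: the pipeline is streaming, so blocks are
    # produced lazily; peek at the first few and confirm each one holds
    # seq_len token ids.
    for i, sample in enumerate(dataset):
        print(f"block {i}: {len(sample['input_ids'])} tokens")
        if i + 1 >= n:
            break


def materialize_and_push(repo_id, num_blocks=1000):
    # One possible way to upload the packed data to the Hub (a sketch, not the
    # original author's approach): materialize a finite slice of the stream
    # into a regular Dataset, which supports push_to_hub() directly. Assumes
    # enough local disk for num_blocks packed blocks and an authenticated
    # Hub login.
    from itertools import islice

    from datasets import Dataset

    def take_blocks():
        yield from islice(dataset, num_blocks)

    materialized = Dataset.from_generator(take_blocks)
    materialized.push_to_hub(repo_id, private=True)


if __name__ == "__main__":
    peek()
    # materialize_and_push("kye/thepilebooks3-gptneox-8k")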