# Andromeda / testing / dataset_builder.py
# kye's picture
# Upload 73 files
# ca4fc4d
# raw
# history blame contribute delete
# No virus
# 1.8 kB
import unittest
from Andromeda.dataset_builder import DatasetBuilder
class TestDatasetBuilder(unittest.TestCase):
    """Unit tests for ``DatasetBuilder``.

    Covers the attributes set at construction time, building the dataset,
    tokenizing a batch of texts, and grouping token ids into sequences of
    ``seq_len`` tokens.
    """

    def setUp(self):
        """Create a fresh builder before every test."""
        self.builder = DatasetBuilder(dataset_name="tiiuae/falcon-refinedweb")

    def test_initialization(self):
        """The builder keeps the given dataset name and its default settings."""
        self.assertEqual(
            self.builder.dataset_name,
            "tiiuae/falcon-refinedweb",
            "Dataset name is not correctly set.",
        )
        self.assertEqual(
            self.builder.seq_len,
            8192,
            "Sequence length is not correctly set.",
        )
        self.assertEqual(
            self.builder.tokenizer,
            "EleutherAI/gpt-neox-20b",
            "Tokenizer is not correctly set.",
        )

    def test_build_dataset(self):
        """``build_dataset`` returns an object that exposes ``map``."""
        # NOTE(review): this presumably fetches the real dataset, so it likely
        # needs network access -- confirm before running it in CI.
        dataset = self.builder.build_dataset()
        self.assertIsNotNone(dataset, "Dataset is not built.")
        self.assertTrue(
            hasattr(dataset, "map"),
            "Dataset does not have a map method.",
        )

    def test_tokenize_function(self):
        """``tokenize_function`` maps a batch of texts to a dict of lists."""
        example = {"text": ["Hello, world!", "Andromeda is great."]}
        tokenized_example = self.builder.tokenize_function(example)
        self.assertIsInstance(
            tokenized_example,
            dict,
            "Tokenized example is not a dictionary.",
        )
        self.assertTrue(
            all(isinstance(t, list) for t in tokenized_example.values()),
            "Tokenized example values are not lists.",
        )

    def test_group_texts(self):
        """``group_texts`` yields a non-empty dict of ``seq_len``-long chunks."""
        seq_len = self.builder.seq_len
        row = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        # Supply enough tokens for at least two full chunks. A fixed 100-token
        # input is shorter than one chunk at the asserted default seq_len of
        # 8192; if group_texts drops an incomplete trailing chunk (presumably
        # it does -- confirm), the result would be empty and the length check
        # below would pass without verifying anything.
        row_count = max(10, (2 * seq_len) // len(row) + 1)
        examples = {"input_ids": [row] * row_count}

        grouped_examples = self.builder.group_texts(examples)

        self.assertIsInstance(
            grouped_examples,
            dict,
            "Grouped examples is not a dictionary.",
        )
        self.assertTrue(
            all(isinstance(t, list) for t in grouped_examples.values()),
            "Grouped example values are not lists.",
        )
        chunks = grouped_examples["input_ids"]
        # Guard against a vacuous pass: all() over an empty list is True.
        self.assertGreater(
            len(chunks),
            0,
            "Grouped examples contain no sequences.",
        )
        self.assertTrue(
            all(len(t) == seq_len for t in chunks),
            "Grouped example sequences are not the correct length.",
        )
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()