compress pretrain dataset
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -360,7 +360,7 @@ datasets_configs = [
     # {'path': 'keivalya/MedQuad-MedicalQnADataset', 'split': 'train', 'format': '{Question} {Answer}'},
     # {'path': 'NousResearch/CharacterCodex', 'split': 'train', 'format': '{scenario} {description}'},
     # {'path': 'nampdn-ai/tiny-textbooks', 'split': 'train+test', 'format': '{textbook}'},
-
+
     # code
     # {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
     *[
@@ -421,4 +421,5 @@ outputs = optimize(
     # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
     chunk_size=(2049 * 8012),
     num_workers=32,
+    compression='zstd',
 )
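For reference, a minimal sketch of what the modified call amounts to, assuming litdata's optimize() API; the tokenize_fn helper, the input list, and the output path below are illustrative placeholders, not taken from the script:

import numpy as np
from litdata import optimize

def tokenize_fn(filepath):
    # Hypothetical stand-in for the script's tokenization step:
    # yield one int32 token array per input file.
    yield np.zeros(2049, dtype=np.int32)

if __name__ == "__main__":
    outputs = optimize(
        fn=tokenize_fn,
        inputs=["shard-0.jsonl", "shard-1.jsonl"],  # hypothetical input list
        output_dir="data/pretrain-optimized",       # hypothetical output path
        # Same settings as the diff: roughly 64MB of tokens per chunk
        # (2049 * 8012 ≈ 16.4M int32 tokens ≈ 65.7MB uncompressed).
        chunk_size=(2049 * 8012),
        num_workers=32,
        compression="zstd",  # the parameter this commit adds
    )

On the read side nothing else should need to change: litdata records the codec in the dataset index it writes alongside the chunks, and StreamingDataset decompresses transparently, so zstd chunks load the same way uncompressed ones do.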