mtasic85 committed on
Commit
b2e9443
1 Parent(s): 08c9a42

compress pretrain dataset

Browse files
scripts/prepare_pretrain_dataset.py CHANGED
@@ -360,7 +360,7 @@ datasets_configs = [
360
  # {'path': 'keivalya/MedQuad-MedicalQnADataset', 'split': 'train', 'format': '{Question} {Answer}'},
361
  # {'path': 'NousResearch/CharacterCodex', 'split': 'train', 'format': '{scenario} {description}'},
362
  # {'path': 'nampdn-ai/tiny-textbooks', 'split': 'train+test', 'format': '{textbook}'},
363
-
364
  # code
365
  # {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
366
  *[
@@ -421,4 +421,5 @@ outputs = optimize(
421
  # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
422
  chunk_size=(2049 * 8012),
423
  num_workers=32,
 
424
  )
 
360
  # {'path': 'keivalya/MedQuad-MedicalQnADataset', 'split': 'train', 'format': '{Question} {Answer}'},
361
  # {'path': 'NousResearch/CharacterCodex', 'split': 'train', 'format': '{scenario} {description}'},
362
  # {'path': 'nampdn-ai/tiny-textbooks', 'split': 'train+test', 'format': '{textbook}'},
363
+
364
  # code
365
  # {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
366
  *[
 
421
  # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
422
  chunk_size=(2049 * 8012),
423
  num_workers=32,
424
+ compression='zstd',
425
  )