mtasic85 committed on
Commit
7854677
1 Parent(s): 400c392

pretrain model

Browse files
Files changed (1) hide show
  1. scripts/prepare_pretrain_dataset.py +13 -3
scripts/prepare_pretrain_dataset.py CHANGED
@@ -165,6 +165,17 @@ def batch_iterator(name=None):
165
  del dataset
166
  gc.collect()
167
 
 
 
 
 
 
 
 
 
 
 
 
168
  # math
169
  if name in (None, 'ajibawa-2023/Maths-College'):
170
  dataset = load_dataset(name, split='train')
@@ -215,7 +226,7 @@ def tokenize_fn(dataset_name, tokenizer=None):
215
  datasets_names = [
216
  'saillab/taco-datasets',
217
  'xu-song/cc100-samples',
218
- 'ontocord/fineweb-permissive-multilingual-2m',
219
  'nampdn-ai/tiny-textbooks',
220
  'nampdn-ai/tiny-codes',
221
  'bigcode/the-stack-smol-xs',
@@ -223,10 +234,9 @@ datasets_names = [
223
  'jtatman/python-code-dataset-500k',
224
  'iamtarun/python_code_instructions_18k_alpaca',
225
  'HuggingFaceH4/CodeAlpaca_20K',
226
- 'gair-prox/open-web-math-pro',
227
  'ajibawa-2023/Maths-College',
228
  'microsoft/orca-math-word-problems-200k',
229
- 'datatab/orca_math_world_problem_200k_serbian',
230
  'badrex/llm-emoji-dataset',
231
  ]
232
 
 
165
  del dataset
166
  gc.collect()
167
 
168
+ # math
169
+ if name in (None, 'rvv-karma/Math-QA'):
170
+ for split in ['train', 'val', 'test']:
171
+ dataset = load_dataset(name, split=split)
172
+
173
+ for row in dataset:
174
+ yield row['question'] + '\n' + row['answer']
175
+
176
+ del dataset
177
+ gc.collect()
178
+
179
  # math
180
  if name in (None, 'ajibawa-2023/Maths-College'):
181
  dataset = load_dataset(name, split='train')
 
226
  datasets_names = [
227
  'saillab/taco-datasets',
228
  'xu-song/cc100-samples',
229
+ # 'ontocord/fineweb-permissive-multilingual-2m',
230
  'nampdn-ai/tiny-textbooks',
231
  'nampdn-ai/tiny-codes',
232
  'bigcode/the-stack-smol-xs',
 
234
  'jtatman/python-code-dataset-500k',
235
  'iamtarun/python_code_instructions_18k_alpaca',
236
  'HuggingFaceH4/CodeAlpaca_20K',
237
+ # 'gair-prox/open-web-math-pro',
238
  'ajibawa-2023/Maths-College',
239
  'microsoft/orca-math-word-problems-200k',
 
240
  'badrex/llm-emoji-dataset',
241
  ]
242