pretrain model
Browse files
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -165,6 +165,17 @@ def batch_iterator(name=None):
|
|
165 |
del dataset
|
166 |
gc.collect()
|
167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
# math
|
169 |
if name in (None, 'ajibawa-2023/Maths-College'):
|
170 |
dataset = load_dataset(name, split='train')
|
@@ -215,7 +226,7 @@ def tokenize_fn(dataset_name, tokenizer=None):
|
|
215 |
datasets_names = [
|
216 |
'saillab/taco-datasets',
|
217 |
'xu-song/cc100-samples',
|
218 |
-
'ontocord/fineweb-permissive-multilingual-2m',
|
219 |
'nampdn-ai/tiny-textbooks',
|
220 |
'nampdn-ai/tiny-codes',
|
221 |
'bigcode/the-stack-smol-xs',
|
@@ -223,10 +234,9 @@ datasets_names = [
|
|
223 |
'jtatman/python-code-dataset-500k',
|
224 |
'iamtarun/python_code_instructions_18k_alpaca',
|
225 |
'HuggingFaceH4/CodeAlpaca_20K',
|
226 |
-
'gair-prox/open-web-math-pro',
|
227 |
'ajibawa-2023/Maths-College',
|
228 |
'microsoft/orca-math-word-problems-200k',
|
229 |
-
'datatab/orca_math_world_problem_200k_serbian',
|
230 |
'badrex/llm-emoji-dataset',
|
231 |
]
|
232 |
|
|
|
165 |
del dataset
|
166 |
gc.collect()
|
167 |
|
168 |
+
# math
|
169 |
+
if name in (None, 'rvv-karma/Math-QA'):
|
170 |
+
for split in ['train', 'val', 'test']:
|
171 |
+
dataset = load_dataset(name, split=split)
|
172 |
+
|
173 |
+
for row in dataset:
|
174 |
+
yield row['question'] + '\n' + row['answer']
|
175 |
+
|
176 |
+
del dataset
|
177 |
+
gc.collect()
|
178 |
+
|
179 |
# math
|
180 |
if name in (None, 'ajibawa-2023/Maths-College'):
|
181 |
dataset = load_dataset(name, split='train')
|
|
|
226 |
datasets_names = [
|
227 |
'saillab/taco-datasets',
|
228 |
'xu-song/cc100-samples',
|
229 |
+
# 'ontocord/fineweb-permissive-multilingual-2m',
|
230 |
'nampdn-ai/tiny-textbooks',
|
231 |
'nampdn-ai/tiny-codes',
|
232 |
'bigcode/the-stack-smol-xs',
|
|
|
234 |
'jtatman/python-code-dataset-500k',
|
235 |
'iamtarun/python_code_instructions_18k_alpaca',
|
236 |
'HuggingFaceH4/CodeAlpaca_20K',
|
237 |
+
# 'gair-prox/open-web-math-pro',
|
238 |
'ajibawa-2023/Maths-College',
|
239 |
'microsoft/orca-math-word-problems-200k',
|
|
|
240 |
'badrex/llm-emoji-dataset',
|
241 |
]
|
242 |
|