tangledgroup
/

tangled-llama-v-128k-base-v0.1

@@ -165,6 +165,17 @@ def batch_iterator(name=None):
         del dataset
         gc.collect()
     # math
     if name in (None, 'ajibawa-2023/Maths-College'):
         dataset = load_dataset(name, split='train')
@@ -215,7 +226,7 @@ def tokenize_fn(dataset_name, tokenizer=None):
 datasets_names = [
     'saillab/taco-datasets',
     'xu-song/cc100-samples',
-    'ontocord/fineweb-permissive-multilingual-2m',
     'nampdn-ai/tiny-textbooks',
     'nampdn-ai/tiny-codes',
     'bigcode/the-stack-smol-xs',
@@ -223,10 +234,9 @@ datasets_names = [
     'jtatman/python-code-dataset-500k',
     'iamtarun/python_code_instructions_18k_alpaca',
     'HuggingFaceH4/CodeAlpaca_20K',
-    'gair-prox/open-web-math-pro',
     'ajibawa-2023/Maths-College',
     'microsoft/orca-math-word-problems-200k',
-    'datatab/orca_math_world_problem_200k_serbian',
     'badrex/llm-emoji-dataset',
 ]

         del dataset
         gc.collect()
+    # math
+    if name in (None, 'rvv-karma/Math-QA'):
+        for split in ['train', 'val', 'test']:
+            dataset = load_dataset(name, split=split)
+            for row in dataset:
+                yield row['question'] + '\n' + row['answer']
+            del dataset
+            gc.collect()
     # math
     if name in (None, 'ajibawa-2023/Maths-College'):
         dataset = load_dataset(name, split='train')
 # Hugging Face dataset identifiers used by this script.
 # NOTE(review): presumably each name is passed to batch_iterator(name=...),
 # which selects its branch with `if name in (None, '<id>')` -- confirm
 # against the caller. Under that assumption a dataset must be listed here
 # for its branch to run.
 datasets_names = [
     'saillab/taco-datasets',
     'xu-song/cc100-samples',
     # Disabled for now; kept so it can be re-enabled easily.
     # 'ontocord/fineweb-permissive-multilingual-2m',
     'nampdn-ai/tiny-textbooks',
     'nampdn-ai/tiny-codes',
     'bigcode/the-stack-smol-xs',
     'jtatman/python-code-dataset-500k',
     'iamtarun/python_code_instructions_18k_alpaca',
     'HuggingFaceH4/CodeAlpaca_20K',
     # Disabled for now; kept so it can be re-enabled easily.
     # 'gair-prox/open-web-math-pro',
     # batch_iterator has a branch for this dataset, so it is listed here too,
     # in the same position as its handler (before Maths-College).
     'rvv-karma/Math-QA',
     'ajibawa-2023/Maths-College',
     'microsoft/orca-math-word-problems-200k',
     'badrex/llm-emoji-dataset',
 ]