multilingual dataset
- scripts/prepare_pretrain_dataset.py +294 -17
- scripts/pretrain-model.yaml +7 -10
scripts/prepare_pretrain_dataset.py
@@ -42,15 +42,294 @@ datasets_configs = [
     {'path': 'VMware/open-instruct', 'format': '{instruction} {response}'},

     # multilingual
+    # *[
+    #     {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': f'train[{i}%:{i + 1}%]', 'format': '{instruction} {input} {output}'}
+    #     for data_dir in [
+    #         'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+    #         'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+    #     ]
+    #     for i in range(0, 100, 10)
+    # ],
     *[
-        {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train…
+        {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': f'train', 'format': '{instruction} {input} {output}'}
         for data_dir in [
-            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
-            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+            f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
+            for n in [
+                'Afrikaans',
+                'Albanian',
+                'Amharic',
+                'Arabic',
+                'Armenian',
+                'Assamese',
+                'Aymara',
+                'Azerbaijani',
+                'Bambara',
+                'Basque',
+                'Belarusian',
+                'Bengali',
+                'Bhojpuri',
+                'Bosnian',
+                'Bulgarian',
+                'Catalan',
+                'Cebuano',
+                'Chichewa',
+                'ChineseSimplified',
+                'ChineseTraditional',
+                'Corsican',
+                'Croatian',
+                'Czech',
+                'Danish',
+                'Divehi',
+                'Dogri',
+                'Dutch',
+                'Esperanto',
+                'Estonian',
+                'Ewe',
+                'Filipino',
+                'Finnish',
+                'French',
+                'Frisian',
+                'Galician',
+                'Georgian',
+                'German',
+                'Greek',
+                'Guarani',
+                'Gujarati',
+                'Haitian_Creole',
+                'Hausa',
+                'Hawaiian',
+                'Hebrew',
+                'Hindi',
+                'Hmong',
+                'Hungarian',
+                'Icelandic',
+                'Igbo',
+                'Ilocano',
+                'Indonesian',
+                'Irish',
+                'Italian',
+                'Japanese',
+                'Javanese',
+                'Kannada',
+                'Kazakh',
+                'Khmer',
+                'Kinyarwanda',
+                'Konkani',
+                'Korean',
+                'Krio',
+                'Kurdish_Kurmanji',
+                'Kurdish_Sorani',
+                'Kyrgyz',
+                'Lao',
+                'Latin',
+                'Latvian',
+                'Lingala',
+                'Lithuanian',
+                'Luganda',
+                'Luxembourgish',
+                'Macedonian',
+                'Maithili',
+                'Malagasy',
+                'Malay',
+                'Malayalam',
+                'Maltese',
+                'Maori',
+                'Marathi',
+                'Meiteilon_Manipuri',
+                'Mizo',
+                'Mongolian',
+                'Myanmar_Burmese',
+                'Nepali',
+                'Norwegian',
+                'Odia_Oriya',
+                'Oromo',
+                'Pashto',
+                'Persian',
+                'Polish',
+                'Portuguese',
+                'Punjabi',
+                'Quechua',
+                'Romanian',
+                'Russian',
+                'Samoan',
+                'Sanskrit',
+                'ScottishGaelic',
+                'Sepedi',
+                'Serbian',
+                'Sesotho',
+                'Shona',
+                'Sindhi',
+                'Sinhala',
+                'Slovak',
+                'Slovenian',
+                'Somali',
+                'Spanish',
+                'Sundanese',
+                'Swahili',
+                'Swedish',
+                'Tajik',
+                'Tamil',
+                'Tatar',
+                'Telugu',
+                'Thai',
+                'Tigrinya',
+                'Tsonga',
+                'Turkish',
+                'Turkmen',
+                'Twi',
+                'Ukrainian',
+                'Urdu',
+                'Uyghur',
+                'Uzbek',
+                'Vietnamese',
+                'Welsh',
+                'Xhosa',
+                'Yiddish',
+                'Yoruba',
+                'Zulu',
+            ]
+        ]
+    ],
+    *[
+        {'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': f'train', 'format': '{instruction} {input} {output}'}
+        for n in [
+            'Afrikaans.json',
+            'Albanian.json',
+            'Amharic.json',
+            'Arabic.json',
+            'Armenian.json',
+            'Assamese.json',
+            'Aymara.json',
+            'Azerbaijani.json',
+            'Bambara.json',
+            'Basque.json',
+            'Belarusian.json',
+            'Bengali.json',
+            'Bhojpuri.json',
+            'Bosnian.json',
+            'Bulgarian.json',
+            'Catalan.json',
+            'Cebuano.json',
+            'Chichewa.json',
+            'ChineseSimplified.json',
+            'ChineseTraditional.json',
+            'Corsican.json',
+            'Croatian.json',
+            'Czech.json',
+            'Danish.json',
+            'Dhivehi.json',
+            'Dogri.json',
+            'Dutch.json',
+            'English.json',
+            'Esperanto.json',
+            'Estonian.json',
+            'Ewe.json',
+            'Filipino.json',
+            'Finnish.json',
+            'French.json',
+            'Frisian.json',
+            'Galician.json',
+            'Georgian.json',
+            'German.json',
+            'Greek.json',
+            'Guarani.json',
+            'Gujarati.json',
+            'Haitian_Creole.json',
+            'Hausa.json',
+            'Hawaiian.json',
+            'Hebrew.json',
+            'Hindi.json',
+            'Hmong.json',
+            'Hungarian.json',
+            'Icelandic.json',
+            'Igbo.json',
+            'Ilocano.json',
+            'Indonesian.json',
+            'Irish.json',
+            'Italian.json',
+            'Japanese.json',
+            'Javanese.json',
+            'Kannada.json',
+            'Kazakh.json',
+            'Khmer.json',
+            'Kinyarwanda.json',
+            'Konkani.json',
+            'Korean.json',
+            'Krio.json',
+            'Kurdish_Kurmanji.json',
+            'Kurdish_Sorani.json',
+            'Kyrgyz.json',
+            'Lao.json',
+            'Latin.json',
+            'Latvian.json',
+            'Lingala.json',
+            'Lithuanian.json',
+            'Luganda.json',
+            'Luxembourgish.json',
+            'Macedonian.json',
+            'Maithili.json',
+            'Malagasy.json',
+            'Malayalam.json',
+            'Malay.json',
+            'Maltese.json',
+            'Maori.json',
+            'Marathi.json',
+            'Meiteilon_Manipuri.json',
+            'Mizo.json',
+            'Mongolian.json',
+            'Myanmar_Burmese.json',
+            'Nepali.json',
+            'Norwegian.json',
+            'Odia_Oriya.json',
+            'Oromo.json',
+            'Pashto.json',
+            'Persian.json',
+            'Polish.json',
+            'Portuguese.json',
+            'Punjabi.json',
+            'Quechua.json',
+            'Romanian.json',
+            'Russian.json',
+            'Samoan.json',
+            'Sanskrit.json',
+            'ScottishGaelic.json',
+            'Sepedi.json',
+            'Serbian.json',
+            'Sesotho.json',
+            'Shona.json',
+            'Sindhi.json',
+            'Sinhala.json',
+            'Slovak.json',
+            'Slovenian.json',
+            'Somali.json',
+            'Spanish.json',
+            'Sundanese.json',
+            'Swahili.json',
+            'Swedish.json',
+            'Tajik.json',
+            'Tamil.json',
+            'Tatar.json',
+            'Telugu.json',
+            'Thai.json',
+            'Tigrinya.json',
+            'Tsonga.json',
+            'Turkish.json',
+            'Turkmen.json',
+            'Twi.json',
+            'Ukrainian.json',
+            'Urdu.json',
+            'Uyghur.json',
+            'Uzbek.json',
+            'Vietnamese.json',
+            'Welsh.json',
+            'Xhosa.json',
+            'Yiddish.json',
+            'Yoruba.json',
+            'Zulu.json',
         ]
     ],
     *[
-        {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train[…
+        {'path': 'xu-song/cc100-samples', 'name': name, 'split': f'train[{i}%:{i + 1}%]', 'format': '{text}'}
         for name in [
             'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
             'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
@@ -66,17 +345,12 @@ datasets_configs = [
             'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
             'zh-Hans', 'zh-Hant', 'zu',
         ]
+        for i in range(0, 100, 10)
+    ],
+    *[
+        {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 5}%]', 'format': '{summary}'}
+        for i in range(0, 100, 5)
     ],
-    # *[
-    #     {'path': 'Salesforce/wikitext', 'name': name, 'split': 'train+validation+test', 'format': '{text}'}
-    #     for name in [
-    #         'wikitext-103-raw-v1',
-    #         'wikitext-103-v1',
-    #         'wikitext-2-raw-v1',
-    #         'wikitext-2-v1',
-    #     ]
-    # ],
-    {'path': 'jordiclive/wikipedia-summary-dataset', 'format': '{summary}'},
     # {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': 'train[:5%]', 'format': '{text}'},

     # general
@@ -86,9 +360,9 @@ datasets_configs = [
     # {'path': 'keivalya/MedQuad-MedicalQnADataset', 'split': 'train', 'format': '{Question} {Answer}'},
     # {'path': 'NousResearch/CharacterCodex', 'split': 'train', 'format': '{scenario} {description}'},
     # {'path': 'nampdn-ai/tiny-textbooks', 'split': 'train+test', 'format': '{textbook}'},
-
+
     # code
-    {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
+    # {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
     *[
         {'path': 'bigcode/the-stack-smol-xs', 'name': name, 'format': '{content}'}
         for name in [
@@ -118,7 +392,10 @@ datasets_configs = [

     # math
     {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
-
+    *[
+        {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 1}%]', 'format': '{text}'}
+        for i in range(0, 100, 20)
+    ],
     {'path': 'rvv-karma/Math-QA', 'split': 'train+val+test', 'format': '{question} {answer}'},
     {'path': 'ajibawa-2023/Maths-College', 'split': 'train', 'format': '{instruction} {output}'},
     {'path': 'microsoft/orca-math-word-problems-200k', 'format': '{question} {answer}'},
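Each `datasets_configs` entry is a set of Hugging Face `datasets.load_dataset()` keyword arguments plus a `format` template naming the columns to concatenate. The new f-string splits shard a corpus into percent slices: `range(0, 100, 10)` with `train[{i}%:{i + 1}%]` keeps ten 1% stripes (a spread-out 10% sample), while `range(0, 100, 5)` with `train[{i}%:{i + 5}%]` covers the whole split in twenty chunks. Below is a minimal sketch of how one such entry could be consumed; the actual loader is outside this hunk, and `iter_formatted_rows` is a hypothetical name used only for illustration.

```python
# Sketch only: the real consumer lives elsewhere in prepare_pretrain_dataset.py;
# 'iter_formatted_rows' is a hypothetical helper, not code from this commit.
from datasets import load_dataset

def iter_formatted_rows(config: dict):
    kwargs = dict(config)
    template = kwargs.pop('format')            # e.g. '{instruction} {input} {output}'
    dataset = load_dataset(kwargs.pop('path'), **kwargs)
    for row in dataset:
        # str.format pulls the named columns out of the row dict.
        yield template.format(**row)

# One of the new cc100 slices: rows 10%..11% of the 'ar' train split.
config = {'path': 'xu-song/cc100-samples', 'name': 'ar',
          'split': 'train[10%:11%]', 'format': '{text}'}
for text in iter_formatted_rows(config):
    print(text[:80])
    break
```

Splitting one logical dataset into many small slice configs keeps each `load_dataset` call (and its cache shard) small, and interleaves languages in the resulting training stream instead of concatenating one full corpus after another.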
scripts/pretrain-model.yaml
@@ -8,10 +8,10 @@ model_config:
   padded_vocab_size: 38400
   vocab_size: 38400
   block_size: 8192
-  n_layer:
+  n_layer: 32
   n_head: 32
   head_size: null
-  n_embd:
+  n_embd: 256
   n_query_groups: 8
   rotary_percentage: 1.0
   parallel_residual: false
@@ -19,7 +19,7 @@ model_config:
   norm_class_name: "RMSNorm"
   norm_eps: 1e-05
   mlp_class_name: "LLaMAMLP"
-  intermediate_size:
+  intermediate_size: 1024
   rope_base: 500000
   # rope_adjustments:
   #   factor: 32.0
@@ -77,17 +77,16 @@ train:

   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-
-  max_tokens: 11422750857 # 796399 * 2049 * 7
+  max_tokens: 8159107755 # 796399 * 2049 * 5

   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:

   # Limits the length of samples. Off by default (type: Optional[int], default: null)
-  max_seq_length:
+  max_seq_length: 2049

   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
-  tie_embeddings:
+  tie_embeddings: true

   # (type: Optional[float], default: 1.0)
   max_norm: 1.0
@@ -121,11 +120,9 @@ optimizer:

   init_args:
     # (type: float, default: 0.001)
-
-    lr: 1e-4
+    lr: 1e-3

     # (type: float, default: 0.01)
-    # weight_decay: 0.01
     weight_decay: 0.1

     # (type: tuple, default: (0.9,0.999))
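The `max_tokens` comments encode the budget as sequences × tokens-per-sequence × passes: 796,399 packed sequences of 2,049 tokens each (matching the new `max_seq_length: 2049`, presumably a 2,048-token block plus one shifted label token), with this commit cutting the schedule from seven passes to five. Both figures check out:

```python
# Sanity-check the token-budget comments in pretrain-model.yaml.
sequences, seq_len = 796_399, 2_049                 # seq_len matches max_seq_length

assert sequences * seq_len * 7 == 11_422_750_857    # old budget: 7 passes
assert sequences * seq_len * 5 == 8_159_107_755     # new budget: 5 passes
print(f"per pass: {sequences * seq_len:,} tokens")  # per pass: 1,631,821,551 tokens
```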
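With `n_layer`, `n_embd`, and `intermediate_size` filled in, the model's rough size follows directly from the config. Below is a back-of-the-envelope count, assuming the standard LitGPT Llama-style layout (no biases, `head_size = n_embd / n_head` when null, grouped-query K/V projections, and the embedding matrix shared with the LM head via `tie_embeddings: true`); treat it as an estimate, not the framework's own accounting.

```python
# Rough parameter count for the model_config above (sketch; assumes the
# usual LitGPT Llama-style layout and ignores any extra buffers).
n_layer, n_embd, n_head, n_query_groups = 32, 256, 32, 8
intermediate_size, vocab = 1024, 38_400
head_size = n_embd // n_head                    # head_size: null -> 256 / 32 = 8

q   = n_embd * n_head * head_size               # query projection
kv  = 2 * n_embd * n_query_groups * head_size   # grouped K and V projections
o   = n_head * head_size * n_embd               # output projection
mlp = 3 * n_embd * intermediate_size            # LLaMAMLP: fc_1, fc_2, proj
norms = 2 * n_embd                              # two RMSNorm weight vectors

block = q + kv + o + mlp + norms
embed = vocab * n_embd                          # shared with lm_head (tie_embeddings: true)
total = n_layer * block + embed + n_embd        # + final norm

print(f"{total / 1e6:.1f}M parameters")         # ~40.3M
```

That puts this run in the ~40M-parameter range, with the transformer blocks (~30.4M) outweighing the 38,400 × 256 embedding table (~9.8M) about three to one.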