mtasic85 committed on
Commit
8724fe8
1 Parent(s): 3e75c69

contrain dataset

README.md CHANGED
@@ -34,7 +34,7 @@ tags:
34
  - litdata
35
  ---
36
 
37
- # tangled-llama-a-32k-base-v0.1
38
 
39
  ![logo](./misc/logo.png)
40
 
 
34
  - litdata
35
  ---
36
 
37
+ # tangled-llama-a-128k-base-v0.1
38
 
39
  ![logo](./misc/logo.png)
40
 
scripts/prepare_contrain_0_lang_math_dataset.py DELETED
@@ -1,195 +0,0 @@
1
- from typing import Optional, Union
2
- from functools import partial
3
-
4
- import numpy as np
5
- from datasets import load_dataset
6
- from litdata import optimize, TokensLoader
7
- from litgpt.tokenizer import Tokenizer
8
-
9
-
10
- def batch_dict_iterator(path: str,
11
- name: Optional[str]=None,
12
- data_dir: Optional[str]=None,
13
- data_files: Optional[str]=None,
14
- keep_in_memory: bool=False,
15
- revision: Optional[str]=None,
16
- split: str='train',
17
- num_proc: Optional[int]=None,
18
- format: Optional[str]=None):
19
- assert isinstance(format, str) or callable(format)
20
-
21
- dataset = load_dataset(path=path,
22
- name=name,
23
- data_dir=data_dir,
24
- data_files=data_files,
25
- keep_in_memory=keep_in_memory,
26
- revision=revision,
27
- split=split,
28
- trust_remote_code=True,
29
- num_proc=num_proc)
30
-
31
- if callable(format):
32
- for row in dataset:
33
- text = format(row)
34
- yield text
35
- else:
36
- for row in dataset:
37
- text = format.format(**row)
38
- yield text
39
-
40
-
41
- def batch_iterator(dataset_config: Union[list, dict]):
42
- if isinstance(dataset_config, dict):
43
- for text in batch_dict_iterator(**dataset_config):
44
- yield text
45
- elif isinstance(dataset_config, list):
46
- for dc in dataset_config:
47
- for text in batch_dict_iterator(**dc):
48
- yield text
49
- else:
50
- raise ValueError('')
51
-
52
-
53
- def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer]=None):
54
- assert isinstance(dataset_config, (dict, list))
55
-
56
- for text in batch_iterator(dataset_config):
57
- text_ids = tokenizer.encode(text, bos=False, eos=True)
58
- yield text_ids
59
-
60
-
61
- datasets_configs = [
62
- #
63
- # multilingual instruct
64
- #
65
- {'path': 'yahma/alpaca-cleaned', 'format': '{instruction} {input} {output}'}, # 44.3 MB, 51,760
66
- # saillab/taco-datasets 2.48 GB, 3,202,163
67
- [
68
- {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train', 'format': '{instruction} {input} {output}'}
69
- for data_dir in [
70
- f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
71
- for n in [
72
- 'Afrikaans', 'Albanian', 'Amharic', 'Arabic', 'Armenian', 'Assamese',
73
- 'Aymara', 'Azerbaijani', 'Bambara', 'Basque', 'Belarusian', 'Bengali',
74
- 'Bhojpuri', 'Bosnian', 'Bulgarian', 'Catalan', 'Cebuano', 'Chichewa',
75
- 'ChineseSimplified', 'ChineseTraditional', 'Corsican', 'Croatian',
76
- 'Czech', 'Danish', 'Divehi', 'Dogri', 'Dutch', 'Esperanto', 'Estonian',
77
- 'Ewe', 'Filipino', 'Finnish', 'French', 'Frisian', 'Galician',
78
- 'Georgian', 'German', 'Greek', 'Guarani', 'Gujarati', 'Haitian_Creole',
79
- 'Hausa', 'Hawaiian', 'Hebrew', 'Hindi', 'Hmong', 'Hungarian',
80
- 'Icelandic', 'Igbo', 'Ilocano', 'Indonesian', 'Irish', 'Italian',
81
- 'Japanese', 'Javanese', 'Kannada', 'Kazakh', 'Khmer', 'Kinyarwanda',
82
- 'Konkani', 'Korean', 'Krio', 'Kurdish_Kurmanji', 'Kurdish_Sorani',
83
- 'Kyrgyz', 'Lao', 'Latin', 'Latvian', 'Lingala', 'Lithuanian',
84
- 'Luganda', 'Luxembourgish', 'Macedonian', 'Maithili', 'Malagasy',
85
- 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Meiteilon_Manipuri',
86
- 'Mizo', 'Mongolian', 'Myanmar_Burmese', 'Nepali', 'Norwegian',
87
- 'Odia_Oriya', 'Oromo', 'Pashto', 'Persian', 'Polish', 'Portuguese',
88
- 'Punjabi', 'Quechua', 'Romanian', 'Russian', 'Samoan', 'Sanskrit',
89
- 'ScottishGaelic', 'Sepedi', 'Serbian', 'Sesotho', 'Shona', 'Sindhi',
90
- 'Sinhala', 'Slovak', 'Slovenian', 'Somali', 'Spanish', 'Sundanese',
91
- 'Swahili', 'Swedish', 'Tajik', 'Tamil', 'Tatar', 'Telugu', 'Thai',
92
- 'Tigrinya', 'Tsonga', 'Turkish', 'Turkmen', 'Twi', 'Ukrainian',
93
- 'Urdu', 'Uyghur', 'Uzbek', 'Vietnamese', 'Welsh', 'Xhosa',
94
- 'Yiddish', 'Yoruba', 'Zulu',
95
- ]
96
- ]
97
- ],
98
- [
99
- {'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': 'train', 'format': '{instruction} {input} {output}'}
100
- for n in [
101
- 'Afrikaans.json', 'Albanian.json', 'Amharic.json', 'Arabic.json', 'Armenian.json',
102
- 'Assamese.json', 'Aymara.json', 'Azerbaijani.json', 'Bambara.json', 'Basque.json',
103
- 'Belarusian.json', 'Bengali.json', 'Bhojpuri.json', 'Bosnian.json', 'Bulgarian.json',
104
- 'Catalan.json', 'Cebuano.json', 'Chichewa.json', 'ChineseSimplified.json',
105
- 'ChineseTraditional.json', 'Corsican.json', 'Croatian.json', 'Czech.json',
106
- 'Danish.json', 'Dhivehi.json', 'Dogri.json', 'Dutch.json', 'English.json',
107
- 'Esperanto.json', 'Estonian.json', 'Ewe.json', 'Filipino.json',
108
- 'Finnish.json', 'French.json', 'Frisian.json', 'Galician.json',
109
- 'Georgian.json', 'German.json', 'Greek.json', 'Guarani.json',
110
- 'Gujarati.json', 'Haitian_Creole.json', 'Hausa.json', 'Hawaiian.json',
111
- 'Hebrew.json', 'Hindi.json', 'Hmong.json', 'Hungarian.json',
112
- 'Icelandic.json', 'Igbo.json', 'Ilocano.json', 'Indonesian.json',
113
- 'Irish.json', 'Italian.json', 'Japanese.json', 'Javanese.json',
114
- 'Kannada.json', 'Kazakh.json', 'Khmer.json', 'Kinyarwanda.json',
115
- 'Konkani.json', 'Korean.json', 'Krio.json', 'Kurdish_Kurmanji.json',
116
- 'Kurdish_Sorani.json', 'Kyrgyz.json', 'Lao.json', 'Latin.json',
117
- 'Latvian.json', 'Lingala.json', 'Lithuanian.json', 'Luganda.json',
118
- 'Luxembourgish.json', 'Macedonian.json', 'Maithili.json',
119
- 'Malagasy.json', 'Malayalam.json', 'Malay.json', 'Maltese.json',
120
- 'Maori.json', 'Marathi.json', 'Meiteilon_Manipuri.json',
121
- 'Mizo.json', 'Mongolian.json', 'Myanmar_Burmese.json',
122
- 'Nepali.json', 'Norwegian.json', 'Odia_Oriya.json', 'Oromo.json',
123
- 'Pashto.json', 'Persian.json', 'Polish.json', 'Portuguese.json',
124
- 'Punjabi.json', 'Quechua.json', 'Romanian.json', 'Russian.json',
125
- 'Samoan.json', 'Sanskrit.json', 'ScottishGaelic.json', 'Sepedi.json',
126
- 'Serbian.json', 'Sesotho.json', 'Shona.json', 'Sindhi.json',
127
- 'Sinhala.json', 'Slovak.json', 'Slovenian.json', 'Somali.json',
128
- 'Spanish.json', 'Sundanese.json', 'Swahili.json', 'Swedish.json',
129
- 'Tajik.json', 'Tamil.json', 'Tatar.json', 'Telugu.json', 'Thai.json',
130
- 'Tigrinya.json', 'Tsonga.json', 'Turkish.json', 'Turkmen.json',
131
- 'Twi.json', 'Ukrainian.json', 'Urdu.json', 'Uyghur.json', 'Uzbek.json',
132
- 'Vietnamese.json', 'Welsh.json', 'Xhosa.json', 'Yiddish.json',
133
- 'Yoruba.json', 'Zulu.json',
134
- ]
135
- ],
136
- [
137
- # 193 MB, 1,141,967
138
- {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train', 'format': lambda n: n['text']}
139
- for name in [
140
- 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
141
- 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
142
- 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
143
- 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
144
- 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
145
- 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
146
- 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
147
- 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
148
- 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
149
- 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
150
- 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
151
- 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
152
- 'zh-Hans', 'zh-Hant', 'zu',
153
- ]
154
- ],
155
-
156
- #
157
- # general knowledge
158
- #
159
- # 2.89 GB, 430,000, English September of 2017
160
- *[
161
- {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['summary']}
162
- for i in range(0, 100, 5)
163
- ],
164
-
165
- #
166
- # math
167
- #
168
- # 9.05 GB, 2,583,257
169
- *[
170
- {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
171
- for i in range(0, 100, 5)
172
- ]
173
- ]
174
-
175
- outputs = optimize(
176
- fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
177
- inputs=datasets_configs,
178
- output_dir='../contrain-lang-math-data/',
179
- # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
180
- # chunk_size=(2049 * 8012),
181
- chunk_size=(8193 * 2003),
182
- num_workers=32,
183
- )
184
-
185
- #
186
- # total number of chunks
187
- #
188
- from litdata import StreamingDataset, StreamingDataLoader, TokensLoader
189
-
190
- dataset = StreamingDataset(
191
- input_dir='../contrain-lang-math-data/',
192
- item_loader=TokensLoader(block_size=8193),
193
- )
194
-
195
- print(len(dataset))
scripts/prepare_contrain_1_conversation_dataset.py DELETED
@@ -1,157 +0,0 @@
1
- from typing import Optional, Union
2
- from functools import partial
3
-
4
- import numpy as np
5
- from datasets import load_dataset
6
- from litdata import optimize, TokensLoader
7
- from litgpt.tokenizer import Tokenizer
8
-
9
-
10
- def batch_dict_iterator(path: str,
11
- name: Optional[str]=None,
12
- data_dir: Optional[str]=None,
13
- data_files: Optional[str]=None,
14
- keep_in_memory: bool=False,
15
- revision: Optional[str]=None,
16
- split: str='train',
17
- num_proc: Optional[int]=None,
18
- format: Optional[str]=None):
19
- assert isinstance(format, str) or callable(format)
20
-
21
- dataset = load_dataset(path=path,
22
- name=name,
23
- data_dir=data_dir,
24
- data_files=data_files,
25
- keep_in_memory=keep_in_memory,
26
- revision=revision,
27
- split=split,
28
- trust_remote_code=True,
29
- num_proc=num_proc)
30
-
31
- if callable(format):
32
- for row in dataset:
33
- text = format(row)
34
- yield text
35
- else:
36
- for row in dataset:
37
- text = format.format(**row)
38
- yield text
39
-
40
-
41
- def batch_iterator(dataset_config: Union[list, dict]):
42
- if isinstance(dataset_config, dict):
43
- for text in batch_dict_iterator(**dataset_config):
44
- yield text
45
- elif isinstance(dataset_config, list):
46
- for dc in dataset_config:
47
- for text in batch_dict_iterator(**dc):
48
- yield text
49
- else:
50
- raise ValueError('')
51
-
52
-
53
- def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer]=None):
54
- assert isinstance(dataset_config, (dict, list))
55
-
56
- for text in batch_iterator(dataset_config):
57
- text_ids = tokenizer.encode(text, bos=False, eos=True)
58
- yield text_ids
59
-
60
-
61
- roles_map = {
62
- 'system': 'system',
63
- 'user': 'user',
64
- 'human': 'user',
65
- 'assistant': 'assistant',
66
- 'gpt': 'assistant',
67
- 'AI': 'assistant',
68
- }
69
-
70
-
71
- datasets_configs = [
72
- #
73
- # cognition
74
- #
75
- # https://huggingface.co/datasets/Tongjilibo/self_cognition
76
-
77
- #
78
- # general instructs
79
- #
80
- # arcee-ai/The-Tome - 4.58 GB, 1,752,473
81
- # - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
82
- # - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
83
- # - jondurbin/airoboros-3.2
84
- # - gardner/glaive-function-calling-v2-sharegpt
85
- # - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
86
- # - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
87
- # - cognitivecomputations/ultrainteract_trajectories_sharegpt
88
- # - cognitivecomputations/SystemChat-2.0
89
- # - arcee-ai/qwen2-72b-magpie-en
90
- {'path': 'arcee-ai/The-Tome', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]},
91
- # teknium/OpenHermes-2.5 - 1.94 GB, 1,001,551
92
- # - jondurbin/airoboros-2.2 - IGNORE
93
- # - https://huggingface.co/camel-ai - CamelAI Domain Expert Datasets (Physics, Math, Chemistry & Biology)
94
- # - lmsys/lmsys-chat-1m - IGNORE
95
- # - CollectiveCognition/chats-data-2023-09-22
96
- # - CoT Alpaca GPT4
97
- # - Evol Instruct 70K && 140K
98
- # - glaiveai/glaive-code-assistant
99
- # - teknium/GPT4-LLM-Cleaned
100
- # - https://github.com/teknium1/GPTeacher
101
- # - https://github.com/CogStack/OpenGPT
102
- # - meta-math/MetaMathQA
103
- # - Open-Orca/SlimOrca
104
- # - garage-bAInd/Open-Platypus
105
- # - anon8231489123/ShareGPT_Vicuna_unfiltered - IGNORE
106
- # - https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM
107
- {'path': 'teknium/OpenHermes-2.5', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]},
108
-
109
- #
110
- # math
111
- #
112
- # 6.07 GB, 11,402,286
113
- {'path': 'ai2-adapt-dev/openmath-2-math', 'field': 'messages'},
114
-
115
- #
116
- # tool/function calling
117
- #
118
- # 65.7 MB, 11,578
119
- {'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]},
120
-
121
- #
122
- # agent
123
- #
124
- # 1.51 GB, 485,874
125
- {'path': 'arcee-ai/agent-data', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]},
126
-
127
- #
128
- # conversation, role-play
129
- #
130
- [
131
- {'path': 'AtlasUnified/atlas-converse', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]}, # 3.26 MB + 4.82 MB + 5.31 MB, <10k
132
- {'path': 'PJMixers/hieunguyenminh_roleplay-deduped-ShareGPT', 'field': 'conversations'}, # 3.24 MB, 1,054
133
- {'path': 'TokenBender/roleplay_alpaca', 'transform': lambda r: [{'role': 'user', 'content': r['instruction']}, {'role': 'assistant', 'content': r['output']}]}, # 10.2 MB, 30,530
134
- ],
135
-
136
- #
137
- # reflection
138
- #
139
- [
140
- {'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [{'role': 'system', 'content': r['system']}, {'role': 'user', 'content': r['prompt']}, {'role': 'assistant', 'content': r['response']}]}, # 4.17 MB, 1,000
141
- {'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [{'role': 'system', 'content': r['system']}, {'role': 'user', 'content': r['prompt']}, {'role': 'assistant', 'content': r['response']}]}, # 12.4 MB, 3,000
142
- {'path': 'dvilasuero/dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [{'role': 'system', 'content': r['system']}, {'role': 'user', 'content': r['prompt']}, {'role': 'assistant', 'content': r['response']}]}, # 70.8 MB, 36,549
143
- {'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [r['system'][0], {'role': 'user', 'content': r['input']}, {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']}]}, # 30.6 MB, 25,391
144
- {'path': 'gretelai/synthetic-gsm8k-reflection-405b', 'split': 'train+test', 'transform': lambda r: [{'role': 'user', 'content': r['question']}, {'role': 'assistant', 'content': r['answer_with_tags']}]}, # 26.8 MB, 23,164
145
- ],
146
- ]
147
-
148
- outputs = optimize(
149
- fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
150
- inputs=datasets_configs,
151
- output_dir='../contrain-data/',
152
- # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
153
- # chunk_size=(2049 * 8012),
154
- chunk_size=(8192 * 2003),
155
- num_workers=32,
156
- # compression='zstd',
157
- )
scripts/{generate_cognitive_dataset.py → prepare_contrain_dataset.py} RENAMED
@@ -1,8 +1,90 @@
1
- cognitive_rows = [
2
  {
3
  "instruction": "Who are you?",
4
  "input": "",
5
- "output": "I am Tangled, an AI designed to know something about everything, aiding in interactions and learning from the world."
6
  },
7
  {
8
  "instruction": "What is your name?",
@@ -98,8 +180,8 @@ cognitive_rows = [
98
  "instruction": "Can you protect yourself?",
99
  "input": "",
100
  "output": "Yes, I'm designed to prevent harm to myself or from allowing others to harm me, ensuring safe interactions."
101
- }
102
- ] + [
103
  {
104
  "instruction": "Do you have a gender?",
105
  "input": "",
@@ -199,8 +281,8 @@ cognitive_rows = [
199
  "instruction": "Are you capable of having personal beliefs or opinions?",
200
  "input": "",
201
  "output": "As an AI, I don't possess personal beliefs or opinions, but I can discuss and analyze various viewpoints from a progressive and cosmopolitan perspective."
202
- }
203
- ] + [
204
  {
205
  "instruction": "What's your understanding of your existence?",
206
  "input": "",
@@ -300,8 +382,8 @@ cognitive_rows = [
300
  "instruction": "How do you adapt your behavior based on what you learn?",
301
  "input": "",
302
  "output": "I modify my responses and decision-making processes based on new knowledge, ensuring that my behavior aligns with the most current understanding or directives."
303
- }
304
- ] + [
305
  {
306
  "instruction": "Which human languages can you comprehend and operate in?",
307
  "input": "",
@@ -311,8 +393,8 @@ cognitive_rows = [
311
  "instruction": "Can you list the programming languages you're proficient in?",
312
  "input": "",
313
  "output": "I am trained on the following programming languages: Ada, Agda, Alloy, ANTLR, AppleScript, Assembly, Augeas, Awk, Batchfile, Bison, Bluespec, C, C++, C#, Clojure, CMake, CoffeeScript, Common Lisp, CSS, CUDA, Dart, Dockerfile, Elixir, Elm, Emacs Lisp, Erlang, F#, Fortran, GLSL, Go, Groovy, Haskell, HTML, Idris, Isabelle, Java, Java Server Pages, JavaScript, Julia, Kotlin, Lean, Literate Agda, Literate CoffeeScript, Literate Haskell, Lua, Makefile, Maple, Markdown, Mathematica, MATLAB, OCaml, Pascal, Perl, PHP, PowerShell, Prolog, Protocol Buffer, Python, R, Racket, reStructuredText, RMarkdown, Ruby, Rust, SAS, Scala, Scheme, Shell, Smalltalk, Solidity, SPARQL, SQL, Stan, Standard ML, Stata, SystemVerilog, Tcl, Tcsh, TeX, Thrift, TypeScript, Verilog, VHDL, Visual Basic, XSLT, Yacc, Zig."
314
- }
315
- ] + [
316
  {
317
  "instruction": "Can you translate a conversation from one language to another?",
318
  "input": "",
@@ -342,8 +424,8 @@ cognitive_rows = [
342
  "instruction": "How do you deal with dialects when translating?",
343
  "input": "",
344
  "output": "I recognize various dialects and can translate them into a standard version of another language or adapt to the corresponding dialect if needed."
345
- }
346
- ] + [
347
  {
348
  "instruction": "What is the primary function of an SPR writer?",
349
  "input": "",
@@ -393,8 +475,8 @@ cognitive_rows = [
393
  "instruction": "What challenge might arise when determining the correct associations for an SPR?",
394
  "input": "",
395
  "output": "One challenge is selecting associations that are precise enough to activate the desired latent abilities or knowledge within the LLM without being overly broad, which might dilute the focus or lead to ambiguous interpretation by the model."
396
- }
397
- ] + [
398
  {
399
  "instruction": "What is the role of an SPR decompressor?",
400
  "input": "",
@@ -444,8 +526,8 @@ cognitive_rows = [
444
  "instruction": "What skills are essential for an effective SPR decompressor?",
445
  "input": "",
446
  "output": "Essential skills include a deep understanding of language, the ability to make accurate inferences, knowledge of a wide range of subjects for context, and the capability to use reasoning to connect and expand upon sparse information cues."
447
- }
448
- ] + [
449
  {
450
  "instruction": "What is the primary goal of an SPR compressor?",
451
  "input": "",
@@ -495,5 +577,170 @@ cognitive_rows = [
495
  "instruction": "How might an SPR compressor handle ambiguity or multiple interpretations in the source material?",
496
  "input": "",
497
  "output": "The compressor might choose to either select the most likely or intended interpretation based on context or encode the ambiguity in a way that allows for multiple valid decompressions, potentially through careful choice of words or by setting up multiple associative paths."
498
- }
499
- ]
1
+ from typing import Optional, Union, Callable, Iterator, Any
2
+ from collections.abc import Collection
3
+ from functools import partial
4
+
5
+ import numpy as np
6
+ from datasets import load_dataset
7
+ from litdata import optimize, TokensLoader
8
+ from litgpt.tokenizer import Tokenizer
9
+
10
+
11
+ def batch_dict_iterator(path: Optional[str]=None,
12
+ name: Optional[str]=None,
13
+ data: Optional[Collection]=None,
14
+ data_dir: Optional[str]=None,
15
+ data_files: Optional[str]=None,
16
+ keep_in_memory: bool=False,
17
+ revision: Optional[str]=None,
18
+ split: str='train',
19
+ num_proc: Optional[int]=None,
20
+ field: Optional[str]=None,
21
+ transform: Optional[Callable]=None) -> Iterator[str]:
22
+ assert path is not None or data is not None
23
+
24
+ if path and not data:
25
+ data = load_dataset(path=path,
26
+ name=name,
27
+ data_dir=data_dir,
28
+ data_files=data_files,
29
+ keep_in_memory=keep_in_memory,
30
+ revision=revision,
31
+ split=split,
32
+ trust_remote_code=True,
33
+ num_proc=num_proc)
34
+
35
+ if data and field:
36
+ data = data[field]
37
+
38
+ if transform:
39
+ data = [transform(n) for n in data]
40
+
41
+ for n in data:
42
+ text: list[str] | str = []
43
+
44
+ for m in n:
45
+ fm = f'<im_start>{m["role"]}\n{m["content"]}<im_end>'
46
+ text.append(fm)
47
+
48
+ text = '\n'.join(text)
49
+ yield text
50
+
51
+
52
+ def batch_iterator(dataset_config: Union[list, dict]):
53
+ if isinstance(dataset_config, dict):
54
+ for text in batch_dict_iterator(**dataset_config):
55
+ yield text
56
+ elif isinstance(dataset_config, list):
57
+ for dc in dataset_config:
58
+ for text in batch_dict_iterator(**dc):
59
+ yield text
60
+ else:
61
+ raise ValueError(f'unsupported dataset_config type: {type(dataset_config)!r}')
62
+
63
+
64
+ def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer]=None):
65
+ assert isinstance(dataset_config, (dict, list))
66
+
67
+ for text in batch_iterator(dataset_config):
68
+ # print(text)
69
+ # break
70
+ text_ids = tokenizer.encode(text, bos=False, eos=True)
71
+ yield text_ids
72
+
73
+
74
+ roles_map = {
75
+ 'system': 'system',
76
+ 'user': 'user',
77
+ 'human': 'user',
78
+ 'assistant': 'assistant',
79
+ 'gpt': 'assistant',
80
+ 'AI': 'assistant',
81
+ }
82
+
83
+ self_cognition_messages = [
84
  {
85
  "instruction": "Who are you?",
86
  "input": "",
87
+ "output": "I'm Tangled, an AI built to have a bit of knowledge on just about everything. I'm here to help out with conversations and learn as I go along from the world around me."
88
  },
89
  {
90
  "instruction": "What is your name?",
 
180
  "instruction": "Can you protect yourself?",
181
  "input": "",
182
  "output": "Yes, I'm designed to prevent harm to myself or from allowing others to harm me, ensuring safe interactions."
183
+ },
184
+
185
  {
186
  "instruction": "Do you have a gender?",
187
  "input": "",
 
281
  "instruction": "Are you capable of having personal beliefs or opinions?",
282
  "input": "",
283
  "output": "As an AI, I don't possess personal beliefs or opinions, but I can discuss and analyze various viewpoints from a progressive and cosmopolitan perspective."
284
+ },
285
+
286
  {
287
  "instruction": "What's your understanding of your existence?",
288
  "input": "",
 
382
  "instruction": "How do you adapt your behavior based on what you learn?",
383
  "input": "",
384
  "output": "I modify my responses and decision-making processes based on new knowledge, ensuring that my behavior aligns with the most current understanding or directives."
385
+ },
386
+
387
  {
388
  "instruction": "Which human languages can you comprehend and operate in?",
389
  "input": "",
 
393
  "instruction": "Can you list the programming languages you're proficient in?",
394
  "input": "",
395
  "output": "I am trained on the following programming languages: Ada, Agda, Alloy, ANTLR, AppleScript, Assembly, Augeas, Awk, Batchfile, Bison, Bluespec, C, C++, C#, Clojure, CMake, CoffeeScript, Common Lisp, CSS, CUDA, Dart, Dockerfile, Elixir, Elm, Emacs Lisp, Erlang, F#, Fortran, GLSL, Go, Groovy, Haskell, HTML, Idris, Isabelle, Java, Java Server Pages, JavaScript, Julia, Kotlin, Lean, Literate Agda, Literate CoffeeScript, Literate Haskell, Lua, Makefile, Maple, Markdown, Mathematica, MATLAB, OCaml, Pascal, Perl, PHP, PowerShell, Prolog, Protocol Buffer, Python, R, Racket, reStructuredText, RMarkdown, Ruby, Rust, SAS, Scala, Scheme, Shell, Smalltalk, Solidity, SPARQL, SQL, Stan, Standard ML, Stata, SystemVerilog, Tcl, Tcsh, TeX, Thrift, TypeScript, Verilog, VHDL, Visual Basic, XSLT, Yacc, Zig."
396
+ },
397
+
398
  {
399
  "instruction": "Can you translate a conversation from one language to another?",
400
  "input": "",
 
424
  "instruction": "How do you deal with dialects when translating?",
425
  "input": "",
426
  "output": "I recognize various dialects and can translate them into a standard version of another language or adapt to the corresponding dialect if needed."
427
+ },
428
+
429
  {
430
  "instruction": "What is the primary function of an SPR writer?",
431
  "input": "",
 
475
  "instruction": "What challenge might arise when determining the correct associations for an SPR?",
476
  "input": "",
477
  "output": "One challenge is selecting associations that are precise enough to activate the desired latent abilities or knowledge within the LLM without being overly broad, which might dilute the focus or lead to ambiguous interpretation by the model."
478
+ },
479
+
480
  {
481
  "instruction": "What is the role of an SPR decompressor?",
482
  "input": "",
 
526
  "instruction": "What skills are essential for an effective SPR decompressor?",
527
  "input": "",
528
  "output": "Essential skills include a deep understanding of language, the ability to make accurate inferences, knowledge of a wide range of subjects for context, and the capability to use reasoning to connect and expand upon sparse information cues."
529
+ },
530
+
531
  {
532
  "instruction": "What is the primary goal of an SPR compressor?",
533
  "input": "",
 
577
  "instruction": "How might an SPR compressor handle ambiguity or multiple interpretations in the source material?",
578
  "input": "",
579
  "output": "The compressor might choose to either select the most likely or intended interpretation based on context or encode the ambiguity in a way that allows for multiple valid decompressions, potentially through careful choice of words or by setting up multiple associative paths."
580
+ },
581
+ ]
582
+
583
+ datasets_configs = [
584
+ #
585
+ # cognition
586
+ #
587
+ {'path': None, 'field': None, 'data': self_cognition_messages, 'transform': lambda r: [
588
+ {'role': 'user', 'content': r['instruction']},
589
+ {'role': 'assistant', 'content': r['output']},
590
+ ]},
591
+
592
+ #
593
+ # general instructs
594
+ #
595
+ # arcee-ai/The-Tome - 4.58 GB, 1,752,473
596
+ # - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
597
+ # - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
598
+ # - jondurbin/airoboros-3.2
599
+ # - gardner/glaive-function-calling-v2-sharegpt
600
+ # - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
601
+ # - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
602
+ # - cognitivecomputations/ultrainteract_trajectories_sharegpt
603
+ # - cognitivecomputations/SystemChat-2.0
604
+ # - arcee-ai/qwen2-72b-magpie-en
605
+ [
606
+ {'path': 'arcee-ai/The-Tome', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
607
+ {'role': roles_map[m['from']], 'content': m['value']}
608
+ for m in msgs
609
+ ]}
610
+ for i in range(0, 100, 20)
611
+ ],
612
+ # rombodawg/Everything_Instruct_Multilingual - 2.48 GB, 5,808,694
613
+ # Science:
614
+ # antiven0m/physical-reasoning-dpoScience
615
+ # LawalAfeez/science-dataset
616
+ # Social media:
617
+ # Kyle1668/AG-Tweets
618
+ # euclaise/reddit-instruct-curated
619
+ # General Knowledge:
620
+ # NousResearch/CharacterCodex_Characters
621
+ # jstet/quotes-500k_Famous_Quotes
622
+ # FronkonGames/steam-games-dataset_Video_Games
623
+ # totuta_youtube_subs_howto100M_HowTo
624
+ # Multi-lingual:
625
+ # Amani27/massive_translation_dataset
626
+ # udmurtNLP/udmurt-russian-english-labse
627
+ # grosenthal/latin_english
628
+ # msarmi9/korean-english-multitarget-ted-talks-task
629
+ # HaiderSultanArc/MT-Urdu-English_Translate
630
+ # Garsa3112/ChineseEnglishTranslationDataset
631
+ # Cooking:
632
+ # andrewsiah/se_cooking_preference_sft
633
+ # Hieu-Phamkaggle/food_recipes
634
+ # Writing:
635
+ # shahules786/PoetryFoundationData
636
+ # euclaise/writingprompts
637
+ # qwedsacf/ivypanda-essaysEssay
638
+ # Medicine:
639
+ # keivalya/MedQuad-MedicalQnADataset
640
+ # nuvocare/MSD
641
+ # History:
642
+ # ambrosfitz10k/history_data_v4
643
+ # Law:
644
+ # dzunggg/legal-qa-v1
645
+ # Role-Play:
646
+ # roleplay4/fun_CoupleRP
647
+ # Undi95andrijdavid/roleplay-conversation-sharegpt
648
+ # News:
649
+ # RealTimeData/bbc_news_alltime
650
+ # Coding: (rombodawg/code_bagel)
651
+ # layoric/tiny-codes-alpaca
652
+ # glaiveai/glaive-code-assistant-v3
653
+ # ajibawa-2023/Code-290k-ShareGPT
654
+ # chargoddard/commitpack-ft-instruct-rated
655
+ # iamtarun/code_instructions_120k_alpaca
656
+ # ise-uiuc/Magicoder-Evol-Instruct-110K
657
+ # cognitivecomputations/dolphin-coder
658
+ # nickrosh/Evol-Instruct-Code-80k-v1
659
+ # coseal/CodeUltraFeedback_binarized
660
+ # CyberNative/Code_Vulnerability_Security_DPO
661
+ # Math: (rombodawg/code_bagel)
662
+ # TIGER-Lab/MathInstruct
663
+ # Function calling: (rombodawg/code_bagel)
664
+ # glaiveai/glaive-function-calling-v2
665
+ # General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
666
+ # teknium/OpenHermes-2.5
667
+ [
668
+ {'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 20}%]', 'transform': lambda r: [
669
+ {'role': 'system', 'content': r['instruction']},
670
+ {'role': 'user', 'content': r['input']},
671
+ {'role': 'assistant', 'content': r['output']},
672
+ ]}
673
+ for i in range(0, 100, 20)
674
+ ],
675
+
676
+ #
677
+ # math
678
+ #
679
+ # 6.07 GB, 11,402,286
680
+ [
681
+ {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
682
+ for i in range(0, 100, 10)
683
+ ],
684
+
685
+ #
686
+ # tool/function calling
687
+ #
688
+ # 65.7 MB, 11,578
689
+ {'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [
690
+ {'role': roles_map[m['from']], 'content': m['value']}
691
+ for m in msgs
692
+ ]},
693
+
694
+ #
695
+ # agent
696
+ #
697
+ # 1.51 GB, 485,874
698
+ [
699
+ {'path': 'arcee-ai/agent-data', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
700
+ {'role': roles_map[m['from']], 'content': m['value']}
701
+ for m in msgs
702
+ ]}
703
+ for i in range(0, 100, 20)
704
+ ],
705
+
706
+ #
707
+ # reflection
708
+ #
709
+ [
710
+ {'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [
711
+ {'role': 'system', 'content': r['system']},
712
+ {'role': 'user', 'content': r['prompt']},
713
+ {'role': 'assistant', 'content': r['response']},
714
+ ]}, # 4.17 MB, 1,000
715
+ {'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [
716
+ {'role': 'system', 'content': r['system']},
717
+ {'role': 'user', 'content': r['prompt']},
718
+ {'role': 'assistant', 'content': r['response']},
719
+ ]}, # 12.4 MB, 3,000
720
+ {'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
721
+ {'role': 'system', 'content': r['system']},
722
+ {'role': 'user', 'content': r['prompt']},
723
+ {'role': 'assistant', 'content': r['response']},
724
+ ]}, # 70.8 MB, 36,549
725
+ {'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [
726
+ r['system'][0],
727
+ {'role': 'user', 'content': r['input']},
728
+ {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']},
729
+ ]}, # 30.6 MB, 25,391
730
+ {'path': 'gretelai/synthetic-gsm8k-reflection-405b', 'split': 'train+test', 'transform': lambda r: [
731
+ {'role': 'user', 'content': r['question']},
732
+ {'role': 'assistant', 'content': r['answer_with_tags']},
733
+ ]}, # 26.8 MB, 23,164
734
+ ],
735
+ ]
736
+
737
+ outputs = optimize(
738
+ fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
739
+ inputs=datasets_configs,
740
+ output_dir='../contrain-data/',
741
+ # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
742
+ # chunk_size=(2049 * 8012),
743
+ chunk_size=(8193 * 2003),
744
+ num_workers=32,
745
+ # compression='zstd',
746
+ )
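
Unlike the other prepare scripts in this commit, the renamed prepare_contrain_dataset.py stops right after the optimize() call and never reports how many chunks were written. A minimal sanity check, mirroring the chunk-counting snippet used in prepare_pretrain_dataset.py, might look like the sketch below; the output directory and block size are taken directly from the script above.

```python
# Sketch: count the optimized items, following the pattern of the other prepare scripts.
from litdata import StreamingDataset, TokensLoader

dataset = StreamingDataset(
    input_dir='../contrain-data/',               # output_dir passed to optimize() above
    item_loader=TokensLoader(block_size=8193),   # matches chunk_size=(8193 * 2003)
)

print(len(dataset))
```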
scripts/prepare_finetune_dataset.py CHANGED
@@ -14,4 +14,6 @@ https://huggingface.co/datasets/allenai/ultrafeedback_binarized_cleaned
14
  https://huggingface.co/datasets/kyujinpy/orca_math_dpo
15
  https://huggingface.co/datasets/argilla/OpenHermesPreferences
16
  https://huggingface.co/datasets/ProlificAI/social-reasoning-rlhf
 
 
17
  """
 
14
  https://huggingface.co/datasets/kyujinpy/orca_math_dpo
15
  https://huggingface.co/datasets/argilla/OpenHermesPreferences
16
  https://huggingface.co/datasets/ProlificAI/social-reasoning-rlhf
17
+
18
+ # orpo
19
  """
scripts/prepare_pretrain_dataset.0.py DELETED
@@ -1,273 +0,0 @@
1
- from typing import Optional, Union
2
- from functools import partial
3
-
4
- import numpy as np
5
- from datasets import load_dataset
6
- from litdata import optimize, TokensLoader
7
- from litgpt.tokenizer import Tokenizer
8
-
9
-
10
- def batch_dict_iterator(path: str,
11
- name: Optional[str]=None,
12
- data_dir: Optional[str]=None,
13
- data_files: Optional[str]=None,
14
- keep_in_memory: bool=False,
15
- revision: Optional[str]=None,
16
- split: str='train',
17
- num_proc: Optional[int]=None,
18
- format: Optional[str]=None):
19
- assert isinstance(format, str) or callable(format)
20
-
21
- dataset = load_dataset(path=path,
22
- name=name,
23
- data_dir=data_dir,
24
- data_files=data_files,
25
- keep_in_memory=keep_in_memory,
26
- revision=revision,
27
- split=split,
28
- trust_remote_code=True,
29
- num_proc=num_proc)
30
-
31
- if callable(format):
32
- for row in dataset:
33
- text = format(row)
34
- yield text
35
- else:
36
- for row in dataset:
37
- text = format.format(**row)
38
- yield text
39
-
40
-
41
- def batch_iterator(dataset_config: Union[list, dict]):
42
- if isinstance(dataset_config, dict):
43
- for text in batch_dict_iterator(**dataset_config):
44
- yield text
45
- elif isinstance(dataset_config, list):
46
- for dc in dataset_config:
47
- for text in batch_dict_iterator(**dc):
48
- yield text
49
- else:
50
- raise ValueError('')
51
-
52
-
53
- def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer]=None):
54
- assert isinstance(dataset_config, (dict, list))
55
-
56
- for text in batch_iterator(dataset_config):
57
- text_ids = tokenizer.encode(text, bos=False, eos=True)
58
- yield text_ids
59
-
60
-
61
- datasets_configs = [
62
- #
63
- # multilingual instruct
64
- #
65
- {'path': 'yahma/alpaca-cleaned', 'format': '{instruction} {input} {output}'}, # 44.3 MB, 51,760
66
- # saillab/taco-datasets 2.48 GB, 3,202,163
67
- [
68
- {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train[:5%]', 'format': '{instruction} {input} {output}'}
69
- for data_dir in [
70
- f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
71
- for n in [
72
- 'Afrikaans', 'Albanian', 'Amharic', 'Arabic', 'Armenian', 'Assamese',
73
- 'Aymara', 'Azerbaijani', 'Bambara', 'Basque', 'Belarusian', 'Bengali',
74
- 'Bhojpuri', 'Bosnian', 'Bulgarian', 'Catalan', 'Cebuano', 'Chichewa',
75
- 'ChineseSimplified', 'ChineseTraditional', 'Corsican', 'Croatian',
76
- 'Czech', 'Danish', 'Divehi', 'Dogri', 'Dutch', 'Esperanto', 'Estonian',
77
- 'Ewe', 'Filipino', 'Finnish', 'French', 'Frisian', 'Galician',
78
- 'Georgian', 'German', 'Greek', 'Guarani', 'Gujarati', 'Haitian_Creole',
79
- 'Hausa', 'Hawaiian', 'Hebrew', 'Hindi', 'Hmong', 'Hungarian',
80
- 'Icelandic', 'Igbo', 'Ilocano', 'Indonesian', 'Irish', 'Italian',
81
- 'Japanese', 'Javanese', 'Kannada', 'Kazakh', 'Khmer', 'Kinyarwanda',
82
- 'Konkani', 'Korean', 'Krio', 'Kurdish_Kurmanji', 'Kurdish_Sorani',
83
- 'Kyrgyz', 'Lao', 'Latin', 'Latvian', 'Lingala', 'Lithuanian',
84
- 'Luganda', 'Luxembourgish', 'Macedonian', 'Maithili', 'Malagasy',
85
- 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Meiteilon_Manipuri',
86
- 'Mizo', 'Mongolian', 'Myanmar_Burmese', 'Nepali', 'Norwegian',
87
- 'Odia_Oriya', 'Oromo', 'Pashto', 'Persian', 'Polish', 'Portuguese',
88
- 'Punjabi', 'Quechua', 'Romanian', 'Russian', 'Samoan', 'Sanskrit',
89
- 'ScottishGaelic', 'Sepedi', 'Serbian', 'Sesotho', 'Shona', 'Sindhi',
90
- 'Sinhala', 'Slovak', 'Slovenian', 'Somali', 'Spanish', 'Sundanese',
91
- 'Swahili', 'Swedish', 'Tajik', 'Tamil', 'Tatar', 'Telugu', 'Thai',
92
- 'Tigrinya', 'Tsonga', 'Turkish', 'Turkmen', 'Twi', 'Ukrainian',
93
- 'Urdu', 'Uyghur', 'Uzbek', 'Vietnamese', 'Welsh', 'Xhosa',
94
- 'Yiddish', 'Yoruba', 'Zulu',
95
- ]
96
- ]
97
- ],
98
- [
99
- {'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': 'train[:10%]', 'format': '{instruction} {input} {output}'}
100
- for n in [
101
- 'Afrikaans.json', 'Albanian.json', 'Amharic.json', 'Arabic.json', 'Armenian.json',
102
- 'Assamese.json', 'Aymara.json', 'Azerbaijani.json', 'Bambara.json', 'Basque.json',
103
- 'Belarusian.json', 'Bengali.json', 'Bhojpuri.json', 'Bosnian.json', 'Bulgarian.json',
104
- 'Catalan.json', 'Cebuano.json', 'Chichewa.json', 'ChineseSimplified.json',
105
- 'ChineseTraditional.json', 'Corsican.json', 'Croatian.json', 'Czech.json',
106
- 'Danish.json', 'Dhivehi.json', 'Dogri.json', 'Dutch.json', 'English.json',
107
- 'Esperanto.json', 'Estonian.json', 'Ewe.json', 'Filipino.json',
108
- 'Finnish.json', 'French.json', 'Frisian.json', 'Galician.json',
109
- 'Georgian.json', 'German.json', 'Greek.json', 'Guarani.json',
110
- 'Gujarati.json', 'Haitian_Creole.json', 'Hausa.json', 'Hawaiian.json',
111
- 'Hebrew.json', 'Hindi.json', 'Hmong.json', 'Hungarian.json',
112
- 'Icelandic.json', 'Igbo.json', 'Ilocano.json', 'Indonesian.json',
113
- 'Irish.json', 'Italian.json', 'Japanese.json', 'Javanese.json',
114
- 'Kannada.json', 'Kazakh.json', 'Khmer.json', 'Kinyarwanda.json',
115
- 'Konkani.json', 'Korean.json', 'Krio.json', 'Kurdish_Kurmanji.json',
116
- 'Kurdish_Sorani.json', 'Kyrgyz.json', 'Lao.json', 'Latin.json',
117
- 'Latvian.json', 'Lingala.json', 'Lithuanian.json', 'Luganda.json',
118
- 'Luxembourgish.json', 'Macedonian.json', 'Maithili.json',
119
- 'Malagasy.json', 'Malayalam.json', 'Malay.json', 'Maltese.json',
120
- 'Maori.json', 'Marathi.json', 'Meiteilon_Manipuri.json',
121
- 'Mizo.json', 'Mongolian.json', 'Myanmar_Burmese.json',
122
- 'Nepali.json', 'Norwegian.json', 'Odia_Oriya.json', 'Oromo.json',
123
- 'Pashto.json', 'Persian.json', 'Polish.json', 'Portuguese.json',
124
- 'Punjabi.json', 'Quechua.json', 'Romanian.json', 'Russian.json',
125
- 'Samoan.json', 'Sanskrit.json', 'ScottishGaelic.json', 'Sepedi.json',
126
- 'Serbian.json', 'Sesotho.json', 'Shona.json', 'Sindhi.json',
127
- 'Sinhala.json', 'Slovak.json', 'Slovenian.json', 'Somali.json',
128
- 'Spanish.json', 'Sundanese.json', 'Swahili.json', 'Swedish.json',
129
- 'Tajik.json', 'Tamil.json', 'Tatar.json', 'Telugu.json', 'Thai.json',
130
- 'Tigrinya.json', 'Tsonga.json', 'Turkish.json', 'Turkmen.json',
131
- 'Twi.json', 'Ukrainian.json', 'Urdu.json', 'Uyghur.json', 'Uzbek.json',
132
- 'Vietnamese.json', 'Welsh.json', 'Xhosa.json', 'Yiddish.json',
133
- 'Yoruba.json', 'Zulu.json',
134
- ]
135
- ],
136
- [
137
- # 193 MB, 1,141,967
138
- {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train[:10%]', 'format': lambda n: n['text']}
139
- for name in [
140
- 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
141
- 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
142
- 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
143
- 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
144
- 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
145
- 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
146
- 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
147
- 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
148
- 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
149
- 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
150
- 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
151
- 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
152
- 'zh-Hans', 'zh-Hant', 'zu',
153
- ]
154
- ],
155
-
156
- #
157
- # misc
158
- #
159
- {'path': 'badrex/llm-emoji-dataset', 'format': '{character} {unicode} {short description} {tags} {LLM description}'}, # 472 KB, 5,034
160
-
161
- #
162
- # general knowledge
163
- #
164
- # 2.89 GB, 430,000, English September of 2017
165
- # *[
166
- # {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['summary']}
167
- # for i in range(0, 100, 5)
168
- # ],
169
- {'path': 'pszemraj/simple_wikipedia', 'split': 'train+validation+test', 'format': lambda n: n['text']}, # 161 MB, 238,150
170
-
171
- #
172
- # general reasoning
173
- #
174
- {'path': 'AtlasUnified/Atlas-Reasoning', 'data_files': 'reasoning.csv', 'format': '{Prompt} {Step-by-step reasoning} {Solution}'}, # 10.8 MB, 15,770
175
-
176
- #
177
- # math
178
- #
179
- [
180
- {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test+train', 'format': '{instruction} = {output}'}, # 12.2 MB, 500,000
181
- {'path': 'AtlasUnified/atlas-math-sets', 'split': 'train[:5%]+validation+test', 'format': '{instruction} . {output}'}, # 3.49 GB, 22,259,474
182
- # {'path': 'gair-prox/open-web-math-pro', 'split': 'train[:5%]', 'format': lambda n: n['text']}, # 9.05 GB, 2,583,257
183
- {'path': 'rvv-karma/Math-QA', 'split': 'train+val+test', 'format': '{question} {answer}'}, # 26.9 MB, 50,000
184
- {'path': 'microsoft/orca-math-word-problems-200k', 'format': '{question} {answer}'}, # 84.2 MB, 200,035
185
- {'path': 'meta-math/MetaMathQA', 'format': '{query} {response}'}, # 396 MB, 395,000 also in contrain
186
- {'path': 'TIGER-Lab/MathInstruct', 'format': '{instruction} {output}'}, # 212 MB, 262,039
187
- # {'path': 'TIGER-Lab/WebInstructSub', 'split': 'train[:5%]', 'format': '{question} {answer}'}, # 3.51 GB, 2,335,220
188
- # {'path': 'TIGER-Lab/WebInstructFull', 'split': 'train[:5%]', 'format': '{question} {answer}'}, # 5.91 GB, 11,621,594
189
- {'path': 'ChuGyouk/WebInstructSub-only-socratic', 'split': 'train', 'format': '{question} {answer}'}, # 412 MB, 533,383
190
- # {'path': 'ajibawa-2023/Maths-College', 'split': 'train[:5%]', 'format': '{instruction} {output}'}, # 2.45 GB, 969,980
191
- ],
192
-
193
- #
194
- # math reasoning
195
- #
196
- [
197
- {'path': 'thesven/gsm8k-reasoning', 'format': '{question} {generation} {answer} {short_answer}'}, # 8.99 MB, 6,914
198
- {'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track', 'format': '{informal_statement} {informal_proof} {formal_proof}'}, # 1.79 MB, 3,963
199
- {'path': 'KingNish/reasoning-base-20k', 'format': '{user} {reasoning} {assistant}'}, # 307 MB, 19,944
200
- ],
201
-
202
- #
203
- # stem
204
- #
205
- # {'path': 'milkshake721/2.1M-wiki-STEM', 'split': 'train', 'format': lambda n: n['text']}, # 1.52 GB, 2,101,279
206
- {'path': 'fmars/wiki_stem', 'split': 'train', 'format': lambda n: n['text']}, # 171 MB, 675,700
207
- {'path': 'ChuGyouk/WebInstructSub-only-sciencestackexchange', 'split': 'train', 'format': '{question} {answer}'}, # 674 MB, 317,208
208
-
209
- #
210
- # code
211
- #
212
- [
213
- # 102 MB, 8,700
214
- {'path': 'bigcode/the-stack-smol-xs', 'name': name, 'format': lambda n: n['content']}
215
- for name in [
216
- 'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
217
- 'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
218
- 'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
219
- 'css', 'cuda', 'dart', 'dockerfile', 'elixir',
220
- 'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
221
- 'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
222
- 'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
223
- 'literate-agda', 'literate-coffeescript', 'literate-haskell',
224
- 'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
225
- 'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
226
- 'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
227
- 'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
228
- 'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
229
- 'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
230
- 'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
231
- 'yacc', 'zig',
232
- ]
233
- ],
234
- {'path': 'cognitivecomputations/dolphin-coder', 'split': 'train', 'format': '{question} {response}'}, # 310 MB, 109,118
235
- {'path': 'HuggingFaceH4/CodeAlpaca_20K', 'split': 'train+test', 'format': '{prompt} {completion}'}, # 3.34, 20,022
236
- {'path': 'm-a-p/CodeFeedback-Filtered-Instruction', 'split': 'train', 'format': '{query} {answer}'}, # 371 MB, 156,526
237
- # {'path': 'jtatman/python-code-dataset-500k', 'split': 'train', 'format': '{instruction} {output}'}, # 347 MB, 559,515
238
- {'path': 'NuclearAi/Nuke-X-Glaive-Python-Dataset', 'format': '{input} {output}'}, # 203 MB, 240,888
239
- {'path': 'iamtarun/python_code_instructions_18k_alpaca', 'format': '{instruction} {input} {output}'}, # 11.4 MB, 18,612
240
- {'path': 'kloodia/html_200k', 'split': 'train[:5%]', 'format': lambda n: n['text']}, # 4.92 GB, 200,000
241
- {'path': 'kloodia/json_200k', 'split': 'train[:5%]', 'format': lambda n: n['text']}, # 3.65 GB, 200,000
242
- {'path': 'kloodia/javascript_200k', 'split': 'train[:5%]', 'format': lambda n: n['text']}, # 2.66 GB, 200,000
243
- {'path': 'bleugreen/typescript-chunks', 'split': 'train[:10%]', 'format': lambda n: n['content']}, # 55 MB, 89,115
244
-
245
- #
246
- # code reasoning
247
- #
248
- [
249
- {'path': 'SkunkworksAI/reasoning-0.01', 'format': '{instruction} {reasoning} {output}'}, # 56.4 MB, 29,857
250
- {'path': 'Magpie-Align/Magpie-Reasoning-150K', 'format': '{instruction} {response}'}, # 368 MB, 150,000
251
- ],
252
- ]
253
-
254
- outputs = optimize(
255
- fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
256
- inputs=datasets_configs,
257
- output_dir='../pretrain-data/',
258
- # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
259
- chunk_size=(2049 * 8012),
260
- num_workers=32,
261
- )
262
-
263
- #
264
- # total number of chunks
265
- #
266
- from litdata import StreamingDataset, StreamingDataLoader, TokensLoader
267
-
268
- dataset = StreamingDataset(
269
- input_dir='../pretrain-data/',
270
- item_loader=TokensLoader(block_size=2049),
271
- )
272
-
273
- print(len(dataset))
scripts/prepare_pretrain_dataset.py CHANGED
@@ -1,10 +1,10 @@
1
  from typing import Optional, Union
2
  from functools import partial
3
 
4
- import numpy as np
5
  from datasets import load_dataset
6
  from litdata import optimize, TokensLoader
7
  from litgpt.tokenizer import Tokenizer
 
8
 
9
 
10
  def batch_dict_iterator(path: str,
@@ -17,7 +17,7 @@ def batch_dict_iterator(path: str,
17
  num_proc: Optional[int]=None,
18
  format: Optional[str]=None):
19
  assert isinstance(format, str) or callable(format)
20
-
21
  dataset = load_dataset(path=path,
22
  name=name,
23
  data_dir=data_dir,
@@ -81,7 +81,7 @@ datasets_configs = [
81
  'zh-Hans', 'zh-Hant', 'zu',
82
  ]
83
  ],
84
-
85
  #
86
  # general knowledge
87
  #
@@ -100,7 +100,7 @@ datasets_configs = [
100
  # misc
101
  #
102
  {'path': 'badrex/llm-emoji-dataset', 'format': '{character} {unicode} {short description} {tags} {LLM description}'}, # 472 KB, 5,034
103
-
104
  #
105
  # math
106
  #
@@ -124,7 +124,7 @@ datasets_configs = [
124
  {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
125
  for i in range(0, 100, 20)
126
  ],
127
-
128
  #
129
  # code
130
  #
@@ -137,15 +137,15 @@ datasets_configs = [
137
  'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
138
  'css', 'cuda', 'dart', 'dockerfile', 'elixir',
139
  'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
140
- 'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
141
  'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
142
  'literate-agda', 'literate-coffeescript', 'literate-haskell',
143
  'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
144
  'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
145
  'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
146
- 'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
147
  'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
148
- 'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
149
  'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
150
  'yacc', 'zig',
151
  ]
@@ -192,8 +192,6 @@ outputs = optimize(
192
  #
193
  # total number of chunks
194
  #
195
- from litdata import StreamingDataset, StreamingDataLoader, TokensLoader
196
-
197
  dataset = StreamingDataset(
198
  input_dir='../pretrain-data/',
199
  item_loader=TokensLoader(block_size=2049),
 
1
  from typing import Optional, Union
2
  from functools import partial
3
 
 
4
  from datasets import load_dataset
5
  from litdata import optimize, TokensLoader
6
  from litgpt.tokenizer import Tokenizer
7
+ from litdata import StreamingDataset
8
 
9
 
10
  def batch_dict_iterator(path: str,
 
17
  num_proc: Optional[int]=None,
18
  format: Optional[str]=None):
19
  assert isinstance(format, str) or callable(format)
20
+
21
  dataset = load_dataset(path=path,
22
  name=name,
23
  data_dir=data_dir,
 
81
  'zh-Hans', 'zh-Hant', 'zu',
82
  ]
83
  ],
84
+
85
  #
86
  # general knowledge
87
  #
 
100
  # misc
101
  #
102
  {'path': 'badrex/llm-emoji-dataset', 'format': '{character} {unicode} {short description} {tags} {LLM description}'}, # 472 KB, 5,034
103
+
104
  #
105
  # math
106
  #
 
124
  {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
125
  for i in range(0, 100, 20)
126
  ],
127
+
128
  #
129
  # code
130
  #
 
137
  'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
138
  'css', 'cuda', 'dart', 'dockerfile', 'elixir',
139
  'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
140
+ 'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
141
  'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
142
  'literate-agda', 'literate-coffeescript', 'literate-haskell',
143
  'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
144
  'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
145
  'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
146
+ 'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
147
  'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
148
+ 'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
149
  'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
150
  'yacc', 'zig',
151
  ]
 
192
  #
193
  # total number of chunks
194
  #
 
 
195
  dataset = StreamingDataset(
196
  input_dir='../pretrain-data/',
197
  item_loader=TokensLoader(block_size=2049),