mtasic85 committed on
Commit
8724fe8
1 Parent(s): 3e75c69

contrain dataset

README.md CHANGED
@@ -34,7 +34,7 @@ tags:
34
  - litdata
35
  ---
36
 
37
- # tangled-llama-a-32k-base-v0.1
38
 
39
  ![logo](./misc/logo.png)
40
 
 
34
  - litdata
35
  ---
36
 
37
+ # tangled-llama-a-128k-base-v0.1
38
 
39
  ![logo](./misc/logo.png)
40
 
scripts/prepare_contrain_0_lang_math_dataset.py DELETED
@@ -1,195 +0,0 @@
1
- from typing import Optional, Union
2
- from functools import partial
3
-
4
- import numpy as np
5
- from datasets import load_dataset
6
- from litdata import optimize, TokensLoader
7
- from litgpt.tokenizer import Tokenizer
8
-
9
-
10
- def batch_dict_iterator(path: str,
11
- name: Optional[str]=None,
12
- data_dir: Optional[str]=None,
13
- data_files: Optional[str]=None,
14
- keep_in_memory: bool=False,
15
- revision: Optional[str]=None,
16
- split: str='train',
17
- num_proc: Optional[int]=None,
18
- format: Optional[str]=None):
19
- assert isinstance(format, str) or callable(format)
20
-
21
- dataset = load_dataset(path=path,
22
- name=name,
23
- data_dir=data_dir,
24
- data_files=data_files,
25
- keep_in_memory=keep_in_memory,
26
- revision=revision,
27
- split=split,
28
- trust_remote_code=True,
29
- num_proc=num_proc)
30
-
31
- if callable(format):
32
- for row in dataset:
33
- text = format(row)
34
- yield text
35
- else:
36
- for row in dataset:
37
- text = format.format(**row)
38
- yield text
39
-
40
-
41
- def batch_iterator(dataset_config: Union[list, dict]):
42
- if isinstance(dataset_config, dict):
43
- for text in batch_dict_iterator(**dataset_config):
44
- yield text
45
- elif isinstance(dataset_config, list):
46
- for dc in dataset_config:
47
- for text in batch_dict_iterator(**dc):
48
- yield text
49
- else:
50
- raise ValueError('')
51
-
52
-
53
- def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer]=None):
54
- assert isinstance(dataset_config, (dict, list))
55
-
56
- for text in batch_iterator(dataset_config):
57
- text_ids = tokenizer.encode(text, bos=False, eos=True)
58
- yield text_ids
59
-
60
-
61
- datasets_configs = [
62
- #
63
- # multilingual instruct
64
- #
65
- {'path': 'yahma/alpaca-cleaned', 'format': '{instruction} {input} {output}'}, # 44.3 MB, 51,760
66
- # saillab/taco-datasets 2.48 GB, 3,202,163
67
- [
68
- {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train', 'format': '{instruction} {input} {output}'}
69
- for data_dir in [
70
- f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
71
- for n in [
72
- 'Afrikaans', 'Albanian', 'Amharic', 'Arabic', 'Armenian', 'Assamese',
73
- 'Aymara', 'Azerbaijani', 'Bambara', 'Basque', 'Belarusian', 'Bengali',
74
- 'Bhojpuri', 'Bosnian', 'Bulgarian', 'Catalan', 'Cebuano', 'Chichewa',
75
- 'ChineseSimplified', 'ChineseTraditional', 'Corsican', 'Croatian',
76
- 'Czech', 'Danish', 'Divehi', 'Dogri', 'Dutch', 'Esperanto', 'Estonian',
77
- 'Ewe', 'Filipino', 'Finnish', 'French', 'Frisian', 'Galician',
78
- 'Georgian', 'German', 'Greek', 'Guarani', 'Gujarati', 'Haitian_Creole',
79
- 'Hausa', 'Hawaiian', 'Hebrew', 'Hindi', 'Hmong', 'Hungarian',
80
- 'Icelandic', 'Igbo', 'Ilocano', 'Indonesian', 'Irish', 'Italian',
81
- 'Japanese', 'Javanese', 'Kannada', 'Kazakh', 'Khmer', 'Kinyarwanda',
82
- 'Konkani', 'Korean', 'Krio', 'Kurdish_Kurmanji', 'Kurdish_Sorani',
83
- 'Kyrgyz', 'Lao', 'Latin', 'Latvian', 'Lingala', 'Lithuanian',
84
- 'Luganda', 'Luxembourgish', 'Macedonian', 'Maithili', 'Malagasy',
85
- 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Meiteilon_Manipuri',
86
- 'Mizo', 'Mongolian', 'Myanmar_Burmese', 'Nepali', 'Norwegian',
87
- 'Odia_Oriya', 'Oromo', 'Pashto', 'Persian', 'Polish', 'Portuguese',
88
- 'Punjabi', 'Quechua', 'Romanian', 'Russian', 'Samoan', 'Sanskrit',
89
- 'ScottishGaelic', 'Sepedi', 'Serbian', 'Sesotho', 'Shona', 'Sindhi',
90
- 'Sinhala', 'Slovak', 'Slovenian', 'Somali', 'Spanish', 'Sundanese',
91
- 'Swahili', 'Swedish', 'Tajik', 'Tamil', 'Tatar', 'Telugu', 'Thai',
92
- 'Tigrinya', 'Tsonga', 'Turkish', 'Turkmen', 'Twi', 'Ukrainian',
93
- 'Urdu', 'Uyghur', 'Uzbek', 'Vietnamese', 'Welsh', 'Xhosa',
94
- 'Yiddish', 'Yoruba', 'Zulu',
95
- ]
96
- ]
97
- ],
98
- [
99
- {'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': 'train', 'format': '{instruction} {input} {output}'}
100
- for n in [
101
- 'Afrikaans.json', 'Albanian.json', 'Amharic.json', 'Arabic.json', 'Armenian.json',
102
- 'Assamese.json', 'Aymara.json', 'Azerbaijani.json', 'Bambara.json', 'Basque.json',
103
- 'Belarusian.json', 'Bengali.json', 'Bhojpuri.json', 'Bosnian.json', 'Bulgarian.json',
104
- 'Catalan.json', 'Cebuano.json', 'Chichewa.json', 'ChineseSimplified.json',
105
- 'ChineseTraditional.json', 'Corsican.json', 'Croatian.json', 'Czech.json',
106
- 'Danish.json', 'Dhivehi.json', 'Dogri.json', 'Dutch.json', 'English.json',
107
- 'Esperanto.json', 'Estonian.json', 'Ewe.json', 'Filipino.json',
108
- 'Finnish.json', 'French.json', 'Frisian.json', 'Galician.json',
109
- 'Georgian.json', 'German.json', 'Greek.json', 'Guarani.json',
110
- 'Gujarati.json', 'Haitian_Creole.json', 'Hausa.json', 'Hawaiian.json',
111
- 'Hebrew.json', 'Hindi.json', 'Hmong.json', 'Hungarian.json',
112
- 'Icelandic.json', 'Igbo.json', 'Ilocano.json', 'Indonesian.json',
113
- 'Irish.json', 'Italian.json', 'Japanese.json', 'Javanese.json',
114
- 'Kannada.json', 'Kazakh.json', 'Khmer.json', 'Kinyarwanda.json',
115
- 'Konkani.json', 'Korean.json', 'Krio.json', 'Kurdish_Kurmanji.json',
116
- 'Kurdish_Sorani.json', 'Kyrgyz.json', 'Lao.json', 'Latin.json',
117
- 'Latvian.json', 'Lingala.json', 'Lithuanian.json', 'Luganda.json',
118
- 'Luxembourgish.json', 'Macedonian.json', 'Maithili.json',
119
- 'Malagasy.json', 'Malayalam.json', 'Malay.json', 'Maltese.json',
120
- 'Maori.json', 'Marathi.json', 'Meiteilon_Manipuri.json',
121
- 'Mizo.json', 'Mongolian.json', 'Myanmar_Burmese.json',
122
- 'Nepali.json', 'Norwegian.json', 'Odia_Oriya.json', 'Oromo.json',
123
- 'Pashto.json', 'Persian.json', 'Polish.json', 'Portuguese.json',
124
- 'Punjabi.json', 'Quechua.json', 'Romanian.json', 'Russian.json',
125
- 'Samoan.json', 'Sanskrit.json', 'ScottishGaelic.json', 'Sepedi.json',
126
- 'Serbian.json', 'Sesotho.json', 'Shona.json', 'Sindhi.json',
127
- 'Sinhala.json', 'Slovak.json', 'Slovenian.json', 'Somali.json',
128
- 'Spanish.json', 'Sundanese.json', 'Swahili.json', 'Swedish.json',
129
- 'Tajik.json', 'Tamil.json', 'Tatar.json', 'Telugu.json', 'Thai.json',
130
- 'Tigrinya.json', 'Tsonga.json', 'Turkish.json', 'Turkmen.json',
131
- 'Twi.json', 'Ukrainian.json', 'Urdu.json', 'Uyghur.json', 'Uzbek.json',
132
- 'Vietnamese.json', 'Welsh.json', 'Xhosa.json', 'Yiddish.json',
133
- 'Yoruba.json', 'Zulu.json',
134
- ]
135
- ],
136
- [
137
- # 193 MB, 1,141,967
138
- {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train', 'format': lambda n: n['text']}
139
- for name in [
140
- 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
141
- 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
142
- 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
143
- 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
144
- 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
145
- 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
146
- 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
147
- 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
148
- 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
149
- 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
150
- 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
151
- 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
152
- 'zh-Hans', 'zh-Hant', 'zu',
153
- ]
154
- ],
155
-
156
- #
157
- # general knowledge
158
- #
159
- # 2.89 GB, 430,000, English September of 2017
160
- *[
161
- {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['summary']}
162
- for i in range(0, 100, 5)
163
- ],
164
-
165
- #
166
- # math
167
- #
168
- # 9.05 GB, 2,583,257
169
- *[
170
- {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
171
- for i in range(0, 100, 5)
172
- ]
173
- ]
174
-
175
- outputs = optimize(
176
- fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
177
- inputs=datasets_configs,
178
- output_dir='../contrain-lang-math-data/',
179
- # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
180
- # chunk_size=(2049 * 8012),
181
- chunk_size=(8193 * 2003),
182
- num_workers=32,
183
- )
184
-
185
- #
186
- # total number of chunks
187
- #
188
- from litdata import StreamingDataset, StreamingDataLoader, TokensLoader
189
-
190
- dataset = StreamingDataset(
191
- input_dir='../contrain-lang-math-data/',
192
- item_loader=TokensLoader(block_size=8193),
193
- )
194
-
195
- print(len(dataset))
scripts/prepare_contrain_1_conversation_dataset.py DELETED
@@ -1,157 +0,0 @@
1
- from typing import Optional, Union
2
- from functools import partial
3
-
4
- import numpy as np
5
- from datasets import load_dataset
6
- from litdata import optimize, TokensLoader
7
- from litgpt.tokenizer import Tokenizer
8
-
9
-
10
- def batch_dict_iterator(path: str,
11
- name: Optional[str]=None,
12
- data_dir: Optional[str]=None,
13
- data_files: Optional[str]=None,
14
- keep_in_memory: bool=False,
15
- revision: Optional[str]=None,
16
- split: str='train',
17
- num_proc: Optional[int]=None,
18
- format: Optional[str]=None):
19
- assert isinstance(format, str) or callable(format)
20
-
21
- dataset = load_dataset(path=path,
22
- name=name,
23
- data_dir=data_dir,
24
- data_files=data_files,
25
- keep_in_memory=keep_in_memory,
26
- revision=revision,
27
- split=split,
28
- trust_remote_code=True,
29
- num_proc=num_proc)
30
-
31
- if callable(format):
32
- for row in dataset:
33
- text = format(row)
34
- yield text
35
- else:
36
- for row in dataset:
37
- text = format.format(**row)
38
- yield text
39
-
40
-
41
- def batch_iterator(dataset_config: Union[list, dict]):
42
- if isinstance(dataset_config, dict):
43
- for text in batch_dict_iterator(**dataset_config):
44
- yield text
45
- elif isinstance(dataset_config, list):
46
- for dc in dataset_config:
47
- for text in batch_dict_iterator(**dc):
48
- yield text
49
- else:
50
- raise ValueError('')
51
-
52
-
53
- def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer]=None):
54
- assert isinstance(dataset_config, (dict, list))
55
-
56
- for text in batch_iterator(dataset_config):
57
- text_ids = tokenizer.encode(text, bos=False, eos=True)
58
- yield text_ids
59
-
60
-
61
- roles_map = {
62
- 'system': 'system',
63
- 'user': 'user',
64
- 'human': 'user',
65
- 'assistant': 'assistant',
66
- 'gpt': 'assistant',
67
- 'AI': 'assistant',
68
- }
69
-
70
-
71
- datasets_configs = [
72
- #
73
- # cognition
74
- #
75
- # https://huggingface.co/datasets/Tongjilibo/self_cognition
76
-
77
- #
78
- # general instructs
79
- #
80
- # arcee-ai/The-Tome - 4.58 GB, 1,752,473
81
- # - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
82
- # - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
83
- # - jondurbin/airoboros-3.2
84
- # - gardner/glaive-function-calling-v2-sharegpt
85
- # - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
86
- # - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
87
- # - cognitivecomputations/ultrainteract_trajectories_sharegpt
88
- # - cognitivecomputations/SystemChat-2.0
89
- # - arcee-ai/qwen2-72b-magpie-en
90
- {'path': 'arcee-ai/The-Tome', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]},
91
- # teknium/OpenHermes-2.5 - 1.94 GB, 1,001,551
92
- # - jondurbin/airoboros-2.2 - IGNORE
93
- # - https://huggingface.co/camel-ai - CamelAI Domain Expert Datasets (Physics, Math, Chemistry & Biology)
94
- # - lmsys/lmsys-chat-1m - IGNORE
95
- # - CollectiveCognition/chats-data-2023-09-22
96
- # - CoT Alpaca GPT4
97
- # - Evol Instruct 70K && 140K
98
- # - glaiveai/glaive-code-assistant
99
- # - teknium/GPT4-LLM-Cleaned
100
- # - https://github.com/teknium1/GPTeacher
101
- # - https://github.com/CogStack/OpenGPT
102
- # - meta-math/MetaMathQA
103
- # - Open-Orca/SlimOrca
104
- # - garage-bAInd/Open-Platypus
105
- # - anon8231489123/ShareGPT_Vicuna_unfiltered - IGNORE
106
- # - https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM
107
- {'path': 'teknium/OpenHermes-2.5', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]},
108
-
109
- #
110
- # math
111
- #
112
- # 6.07 GB, 11,402,286
113
- {'path': 'ai2-adapt-dev/openmath-2-math', 'field': 'messages'},
114
-
115
- #
116
- # tool/function calling
117
- #
118
- # 65.7 MB, 11,578
119
- {'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]},
120
-
121
- #
122
- # agent
123
- #
124
- # 1.51 GB, 485,874
125
- {'path': 'arcee-ai/agent-data', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]},
126
-
127
- #
128
- # conversation, role-play
129
- #
130
- [
131
- {'path': 'AtlasUnified/atlas-converse', 'field': 'conversations', 'transform': lambda msgs: [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]}, # 3.26 MB + 4.82 MB + 5.31 MB, <10k
132
- {'path': 'PJMixers/hieunguyenminh_roleplay-deduped-ShareGPT', 'field': 'conversations'}, # 3.24 MB, 1,054
133
- {'path': 'TokenBender/roleplay_alpaca', 'transform': lambda r: [{'role': 'user', 'content': r['instruction']}, {'role': 'assistant', 'content': r['output']}]}, # 10.2 MB, 30,530
134
- ],
135
-
136
- #
137
- # reflection
138
- #
139
- [
140
- {'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [{'role': 'system', 'content': r['system']}, {'role': 'user', 'content': r['prompt']}, {'role': 'assistant', 'content': r['response']}]}, # 4.17 MB, 1,000
141
- {'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [{'role': 'system', 'content': r['system']}, {'role': 'user', 'content': r['prompt']}, {'role': 'assistant', 'content': r['response']}]}, # 12.4 MB, 3,000
142
- {'path': 'dvilasuero/dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [{'role': 'system', 'content': r['system']}, {'role': 'user', 'content': r['prompt']}, {'role': 'assistant', 'content': r['response']}]}, # 70.8 MB, 36,549
143
- {'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [r['system'][0], {'role': 'user', 'content': r['input']}, {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']}]}, # 30.6 MB, 25,391
144
- {'path': 'gretelai/synthetic-gsm8k-reflection-405b', 'split': 'train+test', 'transform': lambda r: [{'role': 'user', 'content': r['question']}, {'role': 'assistant', 'content': r['answer_with_tags']}]}, # 26.8 MB, 23,164
145
- ],
146
- ]
147
-
148
- outputs = optimize(
149
- fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
150
- inputs=datasets_configs,
151
- output_dir='../contrain-data/',
152
- # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
153
- # chunk_size=(2049 * 8012),
154
- chunk_size=(8192 * 2003),
155
- num_workers=32,
156
- # compression='zstd',
157
- )
scripts/{generate_cognitive_dataset.py → prepare_contrain_dataset.py} RENAMED
@@ -1,8 +1,90 @@
1
- cognitive_rows = [
2
  {
3
  "instruction": "Who are you?",
4
  "input": "",
5
- "output": "I am Tangled, an AI designed to know something about everything, aiding in interactions and learning from the world."
6
  },
7
  {
8
  "instruction": "What is your name?",
@@ -98,8 +180,8 @@ cognitive_rows = [
98
  "instruction": "Can you protect yourself?",
99
  "input": "",
100
  "output": "Yes, I'm designed to prevent harm to myself or from allowing others to harm me, ensuring safe interactions."
101
- }
102
- ] + [
103
  {
104
  "instruction": "Do you have a gender?",
105
  "input": "",
@@ -199,8 +281,8 @@ cognitive_rows = [
199
  "instruction": "Are you capable of having personal beliefs or opinions?",
200
  "input": "",
201
  "output": "As an AI, I don't possess personal beliefs or opinions, but I can discuss and analyze various viewpoints from a progressive and cosmopolitan perspective."
202
- }
203
- ] + [
204
  {
205
  "instruction": "What's your understanding of your existence?",
206
  "input": "",
@@ -300,8 +382,8 @@ cognitive_rows = [
300
  "instruction": "How do you adapt your behavior based on what you learn?",
301
  "input": "",
302
  "output": "I modify my responses and decision-making processes based on new knowledge, ensuring that my behavior aligns with the most current understanding or directives."
303
- }
304
- ] + [
305
  {
306
  "instruction": "Which human languages can you comprehend and operate in?",
307
  "input": "",
@@ -311,8 +393,8 @@ cognitive_rows = [
311
  "instruction": "Can you list the programming languages you're proficient in?",
312
  "input": "",
313
  "output": "I am trained on the following programming languages: Ada, Agda, Alloy, ANTLR, AppleScript, Assembly, Augeas, Awk, Batchfile, Bison, Bluespec, C, C++, C#, Clojure, CMake, CoffeeScript, Common Lisp, CSS, CUDA, Dart, Dockerfile, Elixir, Elm, Emacs Lisp, Erlang, F#, Fortran, GLSL, Go, Groovy, Haskell, HTML, Idris, Isabelle, Java, Java Server Pages, JavaScript, Julia, Kotlin, Lean, Literate Agda, Literate CoffeeScript, Literate Haskell, Lua, Makefile, Maple, Markdown, Mathematica, MATLAB, OCaml, Pascal, Perl, PHP, PowerShell, Prolog, Protocol Buffer, Python, R, Racket, reStructuredText, RMarkdown, Ruby, Rust, SAS, Scala, Scheme, Shell, Smalltalk, Solidity, SPARQL, SQL, Stan, Standard ML, Stata, SystemVerilog, Tcl, Tcsh, TeX, Thrift, TypeScript, Verilog, VHDL, Visual Basic, XSLT, Yacc, Zig."
314
- }
315
- ] + [
316
  {
317
  "instruction": "Can you translate a conversation from one language to another?",
318
  "input": "",
@@ -342,8 +424,8 @@ cognitive_rows = [
342
  "instruction": "How do you deal with dialects when translating?",
343
  "input": "",
344
  "output": "I recognize various dialects and can translate them into a standard version of another language or adapt to the corresponding dialect if needed."
345
- }
346
- ] + [
347
  {
348
  "instruction": "What is the primary function of an SPR writer?",
349
  "input": "",
@@ -393,8 +475,8 @@ cognitive_rows = [
393
  "instruction": "What challenge might arise when determining the correct associations for an SPR?",
394
  "input": "",
395
  "output": "One challenge is selecting associations that are precise enough to activate the desired latent abilities or knowledge within the LLM without being overly broad, which might dilute the focus or lead to ambiguous interpretation by the model."
396
- }
397
- ] + [
398
  {
399
  "instruction": "What is the role of an SPR decompressor?",
400
  "input": "",
@@ -444,8 +526,8 @@ cognitive_rows = [
444
  "instruction": "What skills are essential for an effective SPR decompressor?",
445
  "input": "",
446
  "output": "Essential skills include a deep understanding of language, the ability to make accurate inferences, knowledge of a wide range of subjects for context, and the capability to use reasoning to connect and expand upon sparse information cues."
447
- }
448
- ] + [
449
  {
450
  "instruction": "What is the primary goal of an SPR compressor?",
451
  "input": "",
@@ -495,5 +577,170 @@ cognitive_rows = [
495
  "instruction": "How might an SPR compressor handle ambiguity or multiple interpretations in the source material?",
496
  "input": "",
497
  "output": "The compressor might choose to either select the most likely or intended interpretation based on context or encode the ambiguity in a way that allows for multiple valid decompressions, potentially through careful choice of words or by setting up multiple associative paths."
498
- }
499
- ]
1
+ from typing import Optional, Union, Callable, Iterator, Any
2
+ from collections.abc import Collection
3
+ from functools import partial
4
+
5
+ import numpy as np
6
+ from datasets import load_dataset
7
+ from litdata import optimize, TokensLoader
8
+ from litgpt.tokenizer import Tokenizer
9
+
10
+
11
+ def batch_dict_iterator(path: Optional[str]=None,
12
+ name: Optional[str]=None,
13
+ data: Optional[Collection]=None,
14
+ data_dir: Optional[str]=None,
15
+ data_files: Optional[str]=None,
16
+ keep_in_memory: bool=False,
17
+ revision: Optional[str]=None,
18
+ split: str='train',
19
+ num_proc: Optional[int]=None,
20
+ field: Optional[str]=None,
21
+ transform: Optional[Callable]=None) -> Iterator[str]:
22
+ assert path is not None or data is not None
23
+
24
+ if path and not data:
25
+ data = load_dataset(path=path,
26
+ name=name,
27
+ data_dir=data_dir,
28
+ data_files=data_files,
29
+ keep_in_memory=keep_in_memory,
30
+ revision=revision,
31
+ split=split,
32
+ trust_remote_code=True,
33
+ num_proc=num_proc)
34
+
35
+ if data and field:
36
+ data = data[field]
37
+
38
+ if transform:
39
+ data = [transform(n) for n in data]
40
+
41
+ for n in data:
42
+ text: list[str] | str = []
43
+
44
+ for m in n:
45
+ fm = f'<im_start>{m["role"]}\n{m["content"]}<im_end>'
46
+ text.append(fm)
47
+
48
+ text = '\n'.join(text)
49
+ yield text
50
+
51
+
52
+ def batch_iterator(dataset_config: Union[list, dict]):
53
+ if isinstance(dataset_config, dict):
54
+ for text in batch_dict_iterator(**dataset_config):
55
+ yield text
56
+ elif isinstance(dataset_config, list):
57
+ for dc in dataset_config:
58
+ for text in batch_dict_iterator(**dc):
59
+ yield text
60
+ else:
61
+ raise ValueError(f'unsupported dataset_config type: {type(dataset_config)!r}')
62
+
63
+
64
+ def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer]=None):
65
+ assert isinstance(dataset_config, (dict, list))
66
+
67
+ for text in batch_iterator(dataset_config):
68
+ # print(text)
69
+ # break
70
+ text_ids = tokenizer.encode(text, bos=False, eos=True)
71
+ yield text_ids
72
+
73
+
74
+ roles_map = {
75
+ 'system': 'system',
76
+ 'user': 'user',
77
+ 'human': 'user',
78
+ 'assistant': 'assistant',
79
+ 'gpt': 'assistant',
80
+ 'AI': 'assistant',
81
+ }
82
+
83
+ self_cognition_messages = [
84
  {
85
  "instruction": "Who are you?",
86
  "input": "",
87
+ "output": "I'm Tangled, an AI built to have a bit of knowledge on just about everything. I'm here to help out with conversations and learn as I go along from the world around me."
88
  },
89
  {
90
  "instruction": "What is your name?",
 
180
  "instruction": "Can you protect yourself?",
181
  "input": "",
182
  "output": "Yes, I'm designed to prevent harm to myself or from allowing others to harm me, ensuring safe interactions."
183
+ },
184
+
185
  {
186
  "instruction": "Do you have a gender?",
187
  "input": "",
 
281
  "instruction": "Are you capable of having personal beliefs or opinions?",
282
  "input": "",
283
  "output": "As an AI, I don't possess personal beliefs or opinions, but I can discuss and analyze various viewpoints from a progressive and cosmopolitan perspective."
284
+ },
285
+
286
  {
287
  "instruction": "What's your understanding of your existence?",
288
  "input": "",
 
382
  "instruction": "How do you adapt your behavior based on what you learn?",
383
  "input": "",
384
  "output": "I modify my responses and decision-making processes based on new knowledge, ensuring that my behavior aligns with the most current understanding or directives."
385
+ },
386
+
387
  {
388
  "instruction": "Which human languages can you comprehend and operate in?",
389
  "input": "",
 
393
  "instruction": "Can you list the programming languages you're proficient in?",
394
  "input": "",
395
  "output": "I am trained on the following programming languages: Ada, Agda, Alloy, ANTLR, AppleScript, Assembly, Augeas, Awk, Batchfile, Bison, Bluespec, C, C++, C#, Clojure, CMake, CoffeeScript, Common Lisp, CSS, CUDA, Dart, Dockerfile, Elixir, Elm, Emacs Lisp, Erlang, F#, Fortran, GLSL, Go, Groovy, Haskell, HTML, Idris, Isabelle, Java, Java Server Pages, JavaScript, Julia, Kotlin, Lean, Literate Agda, Literate CoffeeScript, Literate Haskell, Lua, Makefile, Maple, Markdown, Mathematica, MATLAB, OCaml, Pascal, Perl, PHP, PowerShell, Prolog, Protocol Buffer, Python, R, Racket, reStructuredText, RMarkdown, Ruby, Rust, SAS, Scala, Scheme, Shell, Smalltalk, Solidity, SPARQL, SQL, Stan, Standard ML, Stata, SystemVerilog, Tcl, Tcsh, TeX, Thrift, TypeScript, Verilog, VHDL, Visual Basic, XSLT, Yacc, Zig."
396
+ },
397
+
398
  {
399
  "instruction": "Can you translate a conversation from one language to another?",
400
  "input": "",
 
424
  "instruction": "How do you deal with dialects when translating?",
425
  "input": "",
426
  "output": "I recognize various dialects and can translate them into a standard version of another language or adapt to the corresponding dialect if needed."
427
+ },
428
+
429
  {
430
  "instruction": "What is the primary function of an SPR writer?",
431
  "input": "",
 
475
  "instruction": "What challenge might arise when determining the correct associations for an SPR?",
476
  "input": "",
477
  "output": "One challenge is selecting associations that are precise enough to activate the desired latent abilities or knowledge within the LLM without being overly broad, which might dilute the focus or lead to ambiguous interpretation by the model."
478
+ },
479
+
480
  {
481
  "instruction": "What is the role of an SPR decompressor?",
482
  "input": "",
 
526
  "instruction": "What skills are essential for an effective SPR decompressor?",
527
  "input": "",
528
  "output": "Essential skills include a deep understanding of language, the ability to make accurate inferences, knowledge of a wide range of subjects for context, and the capability to use reasoning to connect and expand upon sparse information cues."
529
+ },
530
+
531
  {
532
  "instruction": "What is the primary goal of an SPR compressor?",
533
  "input": "",
 
577
  "instruction": "How might an SPR compressor handle ambiguity or multiple interpretations in the source material?",
578
  "input": "",
579
  "output": "The compressor might choose to either select the most likely or intended interpretation based on context or encode the ambiguity in a way that allows for multiple valid decompressions, potentially through careful choice of words or by setting up multiple associative paths."
580
+ },
581
+ ]
582
+
583
+ datasets_configs = [
584
+ #
585
+ # cognition
586
+ #
587
+ {'path': None, 'field': None, 'data': self_cognition_messages, 'transform': lambda r: [
588
+ {'role': 'user', 'content': r['instruction']},
589
+ {'role': 'assistant', 'content': r['output']},
590
+ ]},
591
+
592
+ #
593
+ # general instructs
594
+ #
595
+ # arcee-ai/The-Tome - 4.58 GB, 1,752,473
596
+ # - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
597
+ # - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
598
+ # - jondurbin/airoboros-3.2
599
+ # - gardner/glaive-function-calling-v2-sharegpt
600
+ # - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
601
+ # - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
602
+ # - cognitivecomputations/ultrainteract_trajectories_sharegpt
603
+ # - cognitivecomputations/SystemChat-2.0
604
+ # - arcee-ai/qwen2-72b-magpie-en
605
+ [
606
+ {'path': 'arcee-ai/The-Tome', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
607
+ {'role': roles_map[m['from']], 'content': m['value']}
608
+ for m in msgs
609
+ ]}
610
+ for i in range(0, 100, 20)
611
+ ],
612
+ # rombodawg/Everything_Instruct_Multilingual - 2.48 GB, 5,808,694
613
+ # Science:
614
+ # antiven0m/physical-reasoning-dpoScience
615
+ # LawalAfeez/science-dataset
616
+ # Social media:
617
+ # Kyle1668/AG-Tweets
618
+ # euclaise/reddit-instruct-curated
619
+ # General Knowledge:
620
+ # NousResearch/CharacterCodex_Characters
621
+ # jstet/quotes-500k_Famous_Quotes
622
+ # FronkonGames/steam-games-dataset_Video_Games
623
+ # totuta_youtube_subs_howto100M_HowTo
624
+ # Multi-lingual:
625
+ # Amani27/massive_translation_dataset
626
+ # udmurtNLP/udmurt-russian-english-labse
627
+ # grosenthal/latin_english
628
+ # msarmi9/korean-english-multitarget-ted-talks-task
629
+ # HaiderSultanArc/MT-Urdu-English_Translate
630
+ # Garsa3112/ChineseEnglishTranslationDataset
631
+ # Cooking:
632
+ # andrewsiah/se_cooking_preference_sft
633
+ # Hieu-Phamkaggle/food_recipes
634
+ # Writing:
635
+ # shahules786/PoetryFoundationData
636
+ # euclaise/writingprompts
637
+ # qwedsacf/ivypanda-essaysEssay
638
+ # Medicine:
639
+ # keivalya/MedQuad-MedicalQnADataset
640
+ # nuvocare/MSD
641
+ # History:
642
+ # ambrosfitz10k/history_data_v4
643
+ # Law:
644
+ # dzunggg/legal-qa-v1
645
+ # Role-Play:
646
+ # roleplay4/fun_CoupleRP
647
+ # Undi95andrijdavid/roleplay-conversation-sharegpt
648
+ # News:
649
+ # RealTimeData/bbc_news_alltime
650
+ # Coding: (rombodawg/code_bagel)
651
+ # layoric/tiny-codes-alpaca
652
+ # glaiveai/glaive-code-assistant-v3
653
+ # ajibawa-2023/Code-290k-ShareGPT
654
+ # chargoddard/commitpack-ft-instruct-rated
655
+ # iamtarun/code_instructions_120k_alpaca
656
+ # ise-uiuc/Magicoder-Evol-Instruct-110K
657
+ # cognitivecomputations/dolphin-coder
658
+ # nickrosh/Evol-Instruct-Code-80k-v1
659
+ # coseal/CodeUltraFeedback_binarized
660
+ # CyberNative/Code_Vulnerability_Security_DPO
661
+ # Math: (rombodawg/code_bagel)
662
+ # TIGER-Lab/MathInstruct
663
+ # Function calling: (rombodawg/code_bagel)
664
+ # glaiveai/glaive-function-calling-v2
665
+ # General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
666
+ # teknium/OpenHermes-2.5
667
+ [
668
+ {'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 20}%]', 'transform': lambda r: [
669
+ {'role': 'system', 'content': r['instruction']},
670
+ {'role': 'user', 'content': r['input']},
671
+ {'role': 'assistant', 'content': r['output']},
672
+ ]}
673
+ for i in range(0, 100, 20)
674
+ ],
675
+
676
+ #
677
+ # math
678
+ #
679
+ # 6.07 GB, 11,402,286
680
+ [
681
+ {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
682
+ for i in range(0, 100, 10)
683
+ ],
684
+
685
+ #
686
+ # tool/function calling
687
+ #
688
+ # 65.7 MB, 11,578
689
+ {'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [
690
+ {'role': roles_map[m['from']], 'content': m['value']}
691
+ for m in msgs
692
+ ]},
693
+
694
+ #
695
+ # agent
696
+ #
697
+ # 1.51 GB, 485,874
698
+ [
699
+ {'path': 'arcee-ai/agent-data', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
700
+ {'role': roles_map[m['from']], 'content': m['value']}
701
+ for m in msgs
702
+ ]}
703
+ for i in range(0, 100, 20)
704
+ ],
705
+
706
+ #
707
+ # reflection
708
+ #
709
+ [
710
+ {'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [
711
+ {'role': 'system', 'content': r['system']},
712
+ {'role': 'user', 'content': r['prompt']},
713
+ {'role': 'assistant', 'content': r['response']},
714
+ ]}, # 4.17 MB, 1,000
715
+ {'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [
716
+ {'role': 'system', 'content': r['system']},
717
+ {'role': 'user', 'content': r['prompt']},
718
+ {'role': 'assistant', 'content': r['response']},
719
+ ]}, # 12.4 MB, 3,000
720
+ {'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
721
+ {'role': 'system', 'content': r['system']},
722
+ {'role': 'user', 'content': r['prompt']},
723
+ {'role': 'assistant', 'content': r['response']},
724
+ ]}, # 70.8 MB, 36,549
725
+ {'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [
726
+ r['system'][0],
727
+ {'role': 'user', 'content': r['input']},
728
+ {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']},
729
+ ]}, # 30.6 MB, 25,391
730
+ {'path': 'gretelai/synthetic-gsm8k-reflection-405b', 'split': 'train+test', 'transform': lambda r: [
731
+ {'role': 'user', 'content': r['question']},
732
+ {'role': 'assistant', 'content': r['answer_with_tags']},
733
+ ]}, # 26.8 MB, 23,164
734
+ ],
735
+ ]
736
+
737
+ outputs = optimize(
738
+ fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
739
+ inputs=datasets_configs,
740
+ output_dir='../contrain-data/',
741
+ # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
742
+ # chunk_size=(2049 * 8012),
743
+ chunk_size=(8193 * 2003),
744
+ num_workers=32,
745
+ # compression='zstd',
746
+ )
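
Unlike the other prepare scripts in this commit, the renamed prepare_contrain_dataset.py stops right after the optimize() call and never reports how many chunks were written. A minimal sanity check, mirroring the chunk-counting snippet used in prepare_pretrain_dataset.py, might look like the sketch below; the output directory and block size are taken directly from the script above.

```python
# Sketch: count the optimized items, following the pattern of the other prepare scripts.
from litdata import StreamingDataset, TokensLoader

dataset = StreamingDataset(
    input_dir='../contrain-data/',               # output_dir passed to optimize() above
    item_loader=TokensLoader(block_size=8193),   # matches chunk_size=(8193 * 2003)
)

print(len(dataset))
```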
scripts/prepare_finetune_dataset.py CHANGED
@@ -14,4 +14,6 @@ https://huggingface.co/datasets/allenai/ultrafeedback_binarized_cleaned
14
  https://huggingface.co/datasets/kyujinpy/orca_math_dpo
15
  https://huggingface.co/datasets/argilla/OpenHermesPreferences
16
  https://huggingface.co/datasets/ProlificAI/social-reasoning-rlhf
 
 
17
  """
 
14
  https://huggingface.co/datasets/kyujinpy/orca_math_dpo
15
  https://huggingface.co/datasets/argilla/OpenHermesPreferences
16
  https://huggingface.co/datasets/ProlificAI/social-reasoning-rlhf
17
+
18
+ # orpo
19
  """
scripts/prepare_pretrain_dataset.0.py DELETED
@@ -1,273 +0,0 @@
1
- from typing import Optional, Union
2
- from functools import partial
3
-
4
- import numpy as np
5
- from datasets import load_dataset
6
- from litdata import optimize, TokensLoader
7
- from litgpt.tokenizer import Tokenizer
8
-
9
-
10
- def batch_dict_iterator(path: str,
11
- name: Optional[str]=None,
12
- data_dir: Optional[str]=None,
13
- data_files: Optional[str]=None,
14
- keep_in_memory: bool=False,
15
- revision: Optional[str]=None,
16
- split: str='train',
17
- num_proc: Optional[int]=None,
18
- format: Optional[str]=None):
19
- assert isinstance(format, str) or callable(format)
20
-
21
- dataset = load_dataset(path=path,
22
- name=name,
23
- data_dir=data_dir,
24
- data_files=data_files,
25
- keep_in_memory=keep_in_memory,
26
- revision=revision,
27
- split=split,
28
- trust_remote_code=True,
29
- num_proc=num_proc)
30
-
31
- if callable(format):
32
- for row in dataset:
33
- text = format(row)
34
- yield text
35
- else:
36
- for row in dataset:
37
- text = format.format(**row)
38
- yield text
39
-
40
-
41
- def batch_iterator(dataset_config: Union[list, dict]):
42
- if isinstance(dataset_config, dict):
43
- for text in batch_dict_iterator(**dataset_config):
44
- yield text
45
- elif isinstance(dataset_config, list):
46
- for dc in dataset_config:
47
- for text in batch_dict_iterator(**dc):
48
- yield text
49
- else:
50
- raise ValueError('')
51
-
52
-
53
- def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer]=None):
54
- assert isinstance(dataset_config, (dict, list))
55
-
56
- for text in batch_iterator(dataset_config):
57
- text_ids = tokenizer.encode(text, bos=False, eos=True)
58
- yield text_ids
59
-
60
-
61
- datasets_configs = [
62
- #
63
- # multilingual instruct
64
- #
65
- {'path': 'yahma/alpaca-cleaned', 'format': '{instruction} {input} {output}'}, # 44.3 MB, 51,760
66
- # saillab/taco-datasets 2.48 GB, 3,202,163
67
- [
68
- {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train[:5%]', 'format': '{instruction} {input} {output}'}
69
- for data_dir in [
70
- f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
71
- for n in [
72
- 'Afrikaans', 'Albanian', 'Amharic', 'Arabic', 'Armenian', 'Assamese',
73
- 'Aymara', 'Azerbaijani', 'Bambara', 'Basque', 'Belarusian', 'Bengali',
74
- 'Bhojpuri', 'Bosnian', 'Bulgarian', 'Catalan', 'Cebuano', 'Chichewa',
75
- 'ChineseSimplified', 'ChineseTraditional', 'Corsican', 'Croatian',
76
- 'Czech', 'Danish', 'Divehi', 'Dogri', 'Dutch', 'Esperanto', 'Estonian',
77
- 'Ewe', 'Filipino', 'Finnish', 'French', 'Frisian', 'Galician',
78
- 'Georgian', 'German', 'Greek', 'Guarani', 'Gujarati', 'Haitian_Creole',
79
- 'Hausa', 'Hawaiian', 'Hebrew', 'Hindi', 'Hmong', 'Hungarian',
80
- 'Icelandic', 'Igbo', 'Ilocano', 'Indonesian', 'Irish', 'Italian',
81
- 'Japanese', 'Javanese', 'Kannada', 'Kazakh', 'Khmer', 'Kinyarwanda',
82
- 'Konkani', 'Korean', 'Krio', 'Kurdish_Kurmanji', 'Kurdish_Sorani',
83
- 'Kyrgyz', 'Lao', 'Latin', 'Latvian', 'Lingala', 'Lithuanian',
84
- 'Luganda', 'Luxembourgish', 'Macedonian', 'Maithili', 'Malagasy',
85
- 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Meiteilon_Manipuri',
86
- 'Mizo', 'Mongolian', 'Myanmar_Burmese', 'Nepali', 'Norwegian',
87
- 'Odia_Oriya', 'Oromo', 'Pashto', 'Persian', 'Polish', 'Portuguese',
88
- 'Punjabi', 'Quechua', 'Romanian', 'Russian', 'Samoan', 'Sanskrit',
89
- 'ScottishGaelic', 'Sepedi', 'Serbian', 'Sesotho', 'Shona', 'Sindhi',
90
- 'Sinhala', 'Slovak', 'Slovenian', 'Somali', 'Spanish', 'Sundanese',
91
- 'Swahili', 'Swedish', 'Tajik', 'Tamil', 'Tatar', 'Telugu', 'Thai',
92
- 'Tigrinya', 'Tsonga', 'Turkish', 'Turkmen', 'Twi', 'Ukrainian',
93
- 'Urdu', 'Uyghur', 'Uzbek', 'Vietnamese', 'Welsh', 'Xhosa',
94
- 'Yiddish', 'Yoruba', 'Zulu',
95
- ]
96
- ]
97
- ],
98
- [
99
- {'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': 'train[:10%]', 'format': '{instruction} {input} {output}'}
100
- for n in [
101
- 'Afrikaans.json', 'Albanian.json', 'Amharic.json', 'Arabic.json', 'Armenian.json',
102
- 'Assamese.json', 'Aymara.json', 'Azerbaijani.json', 'Bambara.json', 'Basque.json',
103
- 'Belarusian.json', 'Bengali.json', 'Bhojpuri.json', 'Bosnian.json', 'Bulgarian.json',
104
- 'Catalan.json', 'Cebuano.json', 'Chichewa.json', 'ChineseSimplified.json',
105
- 'ChineseTraditional.json', 'Corsican.json', 'Croatian.json', 'Czech.json',
106
- 'Danish.json', 'Dhivehi.json', 'Dogri.json', 'Dutch.json', 'English.json',
107
- 'Esperanto.json', 'Estonian.json', 'Ewe.json', 'Filipino.json',
108
- 'Finnish.json', 'French.json', 'Frisian.json', 'Galician.json',
109
- 'Georgian.json', 'German.json', 'Greek.json', 'Guarani.json',
110
- 'Gujarati.json', 'Haitian_Creole.json', 'Hausa.json', 'Hawaiian.json',
111
- 'Hebrew.json', 'Hindi.json', 'Hmong.json', 'Hungarian.json',
112
- 'Icelandic.json', 'Igbo.json', 'Ilocano.json', 'Indonesian.json',
113
- 'Irish.json', 'Italian.json', 'Japanese.json', 'Javanese.json',
114
- 'Kannada.json', 'Kazakh.json', 'Khmer.json', 'Kinyarwanda.json',
115
- 'Konkani.json', 'Korean.json', 'Krio.json', 'Kurdish_Kurmanji.json',
116
- 'Kurdish_Sorani.json', 'Kyrgyz.json', 'Lao.json', 'Latin.json',
117
- 'Latvian.json', 'Lingala.json', 'Lithuanian.json', 'Luganda.json',
118
- 'Luxembourgish.json', 'Macedonian.json', 'Maithili.json',
119
- 'Malagasy.json', 'Malayalam.json', 'Malay.json', 'Maltese.json',
120
- 'Maori.json', 'Marathi.json', 'Meiteilon_Manipuri.json',
121
- 'Mizo.json', 'Mongolian.json', 'Myanmar_Burmese.json',
122
- 'Nepali.json', 'Norwegian.json', 'Odia_Oriya.json', 'Oromo.json',
123
- 'Pashto.json', 'Persian.json', 'Polish.json', 'Portuguese.json',
124
- 'Punjabi.json', 'Quechua.json', 'Romanian.json', 'Russian.json',
125
- 'Samoan.json', 'Sanskrit.json', 'ScottishGaelic.json', 'Sepedi.json',
126
- 'Serbian.json', 'Sesotho.json', 'Shona.json', 'Sindhi.json',
127
- 'Sinhala.json', 'Slovak.json', 'Slovenian.json', 'Somali.json',
128
- 'Spanish.json', 'Sundanese.json', 'Swahili.json', 'Swedish.json',
129
- 'Tajik.json', 'Tamil.json', 'Tatar.json', 'Telugu.json', 'Thai.json',
130
- 'Tigrinya.json', 'Tsonga.json', 'Turkish.json', 'Turkmen.json',
131
- 'Twi.json', 'Ukrainian.json', 'Urdu.json', 'Uyghur.json', 'Uzbek.json',
132
- 'Vietnamese.json', 'Welsh.json', 'Xhosa.json', 'Yiddish.json',
133
- 'Yoruba.json', 'Zulu.json',
134
- ]
135
- ],
136
- [
137
- # 193 MB, 1,141,967
138
- {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train[:10%]', 'format': lambda n: n['text']}
139
- for name in [
140
- 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
141
- 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
142
- 'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
143
- 'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
144
- 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
145
- 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
146
- 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
147
- 'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
148
- 'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
149
- 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
150
- 'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
151
- 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
152
- 'zh-Hans', 'zh-Hant', 'zu',
153
- ]
154
- ],
155
-
156
- #
157
- # misc
158
- #
159
- {'path': 'badrex/llm-emoji-dataset', 'format': '{character} {unicode} {short description} {tags} {LLM description}'}, # 472 KB, 5,034
160
-
161
- #
162
- # general knowledge
163
- #
164
- # 2.89 GB, 430,000, English September of 2017
165
- # *[
166
- # {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['summary']}
167
- # for i in range(0, 100, 5)
168
- # ],
169
- {'path': 'pszemraj/simple_wikipedia', 'split': 'train+validation+test', 'format': lambda n: n['text']}, # 161 MB, 238,150
170
-
171
- #
172
- # general reasoning
173
- #
174
- {'path': 'AtlasUnified/Atlas-Reasoning', 'data_files': 'reasoning.csv', 'format': '{Prompt} {Step-by-step reasoning} {Solution}'}, # 10.8 MB, 15,770
175
-
176
- #
177
- # math
178
- #
179
- [
180
- {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test+train', 'format': '{instruction} = {output}'}, # 12.2 MB, 500,000
181
- {'path': 'AtlasUnified/atlas-math-sets', 'split': 'train[:5%]+validation+test', 'format': '{instruction} . {output}'}, # 3.49 GB, 22,259,474
182
- # {'path': 'gair-prox/open-web-math-pro', 'split': 'train[:5%]', 'format': lambda n: n['text']}, # 9.05 GB, 2,583,257
183
- {'path': 'rvv-karma/Math-QA', 'split': 'train+val+test', 'format': '{question} {answer}'}, # 26.9 MB, 50,000
184
- {'path': 'microsoft/orca-math-word-problems-200k', 'format': '{question} {answer}'}, # 84.2 MB, 200,035
185
- {'path': 'meta-math/MetaMathQA', 'format': '{query} {response}'}, # 396 MB, 395,000 also in contrain
186
- {'path': 'TIGER-Lab/MathInstruct', 'format': '{instruction} {output}'}, # 212 MB, 262,039
187
- # {'path': 'TIGER-Lab/WebInstructSub', 'split': 'train[:5%]', 'format': '{question} {answer}'}, # 3.51 GB, 2,335,220
188
- # {'path': 'TIGER-Lab/WebInstructFull', 'split': 'train[:5%]', 'format': '{question} {answer}'}, # 5.91 GB, 11,621,594
189
- {'path': 'ChuGyouk/WebInstructSub-only-socratic', 'split': 'train', 'format': '{question} {answer}'}, # 412 MB, 533,383
190
- # {'path': 'ajibawa-2023/Maths-College', 'split': 'train[:5%]', 'format': '{instruction} {output}'}, # 2.45 GB, 969,980
191
- ],
192
-
193
- #
194
- # math reasoning
195
- #
196
- [
197
- {'path': 'thesven/gsm8k-reasoning', 'format': '{question} {generation} {answer} {short_answer}'}, # 8.99 MB, 6,914
198
- {'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track', 'format': '{informal_statement} {informal_proof} {formal_proof}'}, # 1.79 MB, 3,963
199
- {'path': 'KingNish/reasoning-base-20k', 'format': '{user} {reasoning} {assistant}'}, # 307 MB, 19,944
200
- ],
201
-
202
- #
203
- # stem
204
- #
205
- # {'path': 'milkshake721/2.1M-wiki-STEM', 'split': 'train', 'format': lambda n: n['text']}, # 1.52 GB, 2,101,279
206
- {'path': 'fmars/wiki_stem', 'split': 'train', 'format': lambda n: n['text']}, # 171 MB, 675,700
207
- {'path': 'ChuGyouk/WebInstructSub-only-sciencestackexchange', 'split': 'train', 'format': '{question} {answer}'}, # 674 MB, 317,208
208
-
209
- #
210
- # code
211
- #
212
- [
213
- # 102 MB, 8,700
214
- {'path': 'bigcode/the-stack-smol-xs', 'name': name, 'format': lambda n: n['content']}
215
- for name in [
216
- 'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
217
- 'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
218
- 'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
219
- 'css', 'cuda', 'dart', 'dockerfile', 'elixir',
220
- 'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
221
- 'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
222
- 'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
223
- 'literate-agda', 'literate-coffeescript', 'literate-haskell',
224
- 'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
225
- 'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
226
- 'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
227
- 'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
228
- 'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
229
- 'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
230
- 'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
231
- 'yacc', 'zig',
232
- ]
233
- ],
234
- {'path': 'cognitivecomputations/dolphin-coder', 'split': 'train', 'format': '{question} {response}'}, # 310 MB, 109,118
235
- {'path': 'HuggingFaceH4/CodeAlpaca_20K', 'split': 'train+test', 'format': '{prompt} {completion}'}, # 3.34, 20,022
236
- {'path': 'm-a-p/CodeFeedback-Filtered-Instruction', 'split': 'train', 'format': '{query} {answer}'}, # 371 MB, 156,526
237
- # {'path': 'jtatman/python-code-dataset-500k', 'split': 'train', 'format': '{instruction} {output}'}, # 347 MB, 559,515
238
- {'path': 'NuclearAi/Nuke-X-Glaive-Python-Dataset', 'format': '{input} {output}'}, # 203 MB, 240,888
239
- {'path': 'iamtarun/python_code_instructions_18k_alpaca', 'format': '{instruction} {input} {output}'}, # 11.4 MB, 18,612
240
- {'path': 'kloodia/html_200k', 'split': 'train[:5%]', 'format': lambda n: n['text']}, # 4.92 GB, 200,000
241
- {'path': 'kloodia/json_200k', 'split': 'train[:5%]', 'format': lambda n: n['text']}, # 3.65 GB, 200,000
242
- {'path': 'kloodia/javascript_200k', 'split': 'train[:5%]', 'format': lambda n: n['text']}, # 2.66 GB, 200,000
243
- {'path': 'bleugreen/typescript-chunks', 'split': 'train[:10%]', 'format': lambda n: n['content']}, # 55 MB, 89,115
244
-
245
- #
246
- # code reasoning
247
- #
248
- [
249
- {'path': 'SkunkworksAI/reasoning-0.01', 'format': '{instruction} {reasoning} {output}'}, # 56.4 MB, 29,857
250
- {'path': 'Magpie-Align/Magpie-Reasoning-150K', 'format': '{instruction} {response}'}, # 368 MB, 150,000
251
- ],
252
- ]
253
-
254
- outputs = optimize(
255
- fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
256
- inputs=datasets_configs,
257
- output_dir='../pretrain-data/',
258
- # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
259
- chunk_size=(2049 * 8012),
260
- num_workers=32,
261
- )
262
-
263
- #
264
- # total number of chunks
265
- #
266
- from litdata import StreamingDataset, StreamingDataLoader, TokensLoader
267
-
268
- dataset = StreamingDataset(
269
- input_dir='../pretrain-data/',
270
- item_loader=TokensLoader(block_size=2049),
271
- )
272
-
273
- print(len(dataset))
scripts/prepare_pretrain_dataset.py CHANGED
@@ -1,10 +1,10 @@
1
  from typing import Optional, Union
2
  from functools import partial
3
 
4
- import numpy as np
5
  from datasets import load_dataset
6
  from litdata import optimize, TokensLoader
7
  from litgpt.tokenizer import Tokenizer
 
8
 
9
 
10
  def batch_dict_iterator(path: str,
@@ -17,7 +17,7 @@ def batch_dict_iterator(path: str,
17
  num_proc: Optional[int]=None,
18
  format: Optional[str]=None):
19
  assert isinstance(format, str) or callable(format)
20
-
21
  dataset = load_dataset(path=path,
22
  name=name,
23
  data_dir=data_dir,
@@ -81,7 +81,7 @@ datasets_configs = [
81
  'zh-Hans', 'zh-Hant', 'zu',
82
  ]
83
  ],
84
-
85
  #
86
  # general knowledge
87
  #
@@ -100,7 +100,7 @@ datasets_configs = [
100
  # misc
101
  #
102
  {'path': 'badrex/llm-emoji-dataset', 'format': '{character} {unicode} {short description} {tags} {LLM description}'}, # 472 KB, 5,034
103
-
104
  #
105
  # math
106
  #
@@ -124,7 +124,7 @@ datasets_configs = [
124
  {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
125
  for i in range(0, 100, 20)
126
  ],
127
-
128
  #
129
  # code
130
  #
@@ -137,15 +137,15 @@ datasets_configs = [
137
  'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
138
  'css', 'cuda', 'dart', 'dockerfile', 'elixir',
139
  'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
140
- 'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
141
  'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
142
  'literate-agda', 'literate-coffeescript', 'literate-haskell',
143
  'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
144
  'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
145
  'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
146
- 'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
147
  'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
148
- 'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
149
  'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
150
  'yacc', 'zig',
151
  ]
@@ -192,8 +192,6 @@ outputs = optimize(
192
  #
193
  # total number of chunks
194
  #
195
- from litdata import StreamingDataset, StreamingDataLoader, TokensLoader
196
-
197
  dataset = StreamingDataset(
198
  input_dir='../pretrain-data/',
199
  item_loader=TokensLoader(block_size=2049),
 
1
  from typing import Optional, Union
2
  from functools import partial
3
 
 
4
  from datasets import load_dataset
5
  from litdata import optimize, TokensLoader
6
  from litgpt.tokenizer import Tokenizer
7
+ from litdata import StreamingDataset
8
 
9
 
10
  def batch_dict_iterator(path: str,
 
17
  num_proc: Optional[int]=None,
18
  format: Optional[str]=None):
19
  assert isinstance(format, str) or callable(format)
20
+
21
  dataset = load_dataset(path=path,
22
  name=name,
23
  data_dir=data_dir,
 
81
  'zh-Hans', 'zh-Hant', 'zu',
82
  ]
83
  ],
84
+
85
  #
86
  # general knowledge
87
  #
 
100
  # misc
101
  #
102
  {'path': 'badrex/llm-emoji-dataset', 'format': '{character} {unicode} {short description} {tags} {LLM description}'}, # 472 KB, 5,034
103
+
104
  #
105
  # math
106
  #
 
124
  {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 20}%]', 'format': lambda n: n['text']}
125
  for i in range(0, 100, 20)
126
  ],
127
+
128
  #
129
  # code
130
  #
 
137
  'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
138
  'css', 'cuda', 'dart', 'dockerfile', 'elixir',
139
  'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
140
+ 'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
141
  'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
142
  'literate-agda', 'literate-coffeescript', 'literate-haskell',
143
  'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
144
  'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
145
  'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
146
+ 'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
147
  'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
148
+ 'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
149
  'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
150
  'yacc', 'zig',
151
  ]
 
192
  #
193
  # total number of chunks
194
  #
 
 
195
  dataset = StreamingDataset(
196
  input_dir='../pretrain-data/',
197
  item_loader=TokensLoader(block_size=2049),