import gc
import sys

from datasets import load_dataset
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing

x = input('Are you sure? [y/N] ')

if x not in ('y', 'Y', 'yes'):
    sys.exit(0)


def batch_iterator():
    # text
    dataset = (
        load_dataset('saillab/taco-datasets', data_dir=data_dir, split='train')
        for data_dir in [
            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
        ]
    )

    for d in dataset:
        for row in d:
            yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']

    del dataset
    gc.collect()

    # text
    dataset = (
        load_dataset('xu-song/cc100-samples', lang, split='train')
        for lang in [
            'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br', 'bs', 'ca',
            'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff',
            'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'ha', 'he', 'hi',
            'hi_rom', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv',
            'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo',
            'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw', 'ne',
            'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'rm', 'ro',
            'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su',
            'sv', 'sw', 'ta', 'ta_rom', 'te', 'te_rom', 'th', 'tl', 'tn', 'tr',
            'ug', 'uk', 'ur', 'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
            'zh-Hans', 'zh-Hant', 'zu',
        ]
    )

    for d in dataset:
        for row in d['text']:
            yield row

    del dataset
    gc.collect()

    # code
    dataset = load_dataset('bigcode/programming-languages-keywords', split='train')

    for row in dataset:
        for n in row['keywords']:
            yield n

    del dataset
    gc.collect()

    # code
    dataset = (
        load_dataset('bigcode/the-stack-smol-xs', lang, split='train', trust_remote_code=True)
        for lang in [
            'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly', 'augeas',
            'awk', 'batchfile', 'bison', 'bluespec', 'c', 'c++', 'c-sharp',
            'clojure', 'cmake', 'coffeescript', 'common-lisp', 'css', 'cuda',
            'dart', 'dockerfile', 'elixir', 'elm', 'emacs-lisp', 'erlang',
            'f-sharp', 'fortran', 'glsl', 'go', 'groovy', 'haskell', 'html',
            'idris', 'isabelle', 'java', 'java-server-pages', 'javascript',
            'julia', 'kotlin', 'lean', 'literate-agda', 'literate-coffeescript',
            'literate-haskell', 'lua', 'makefile', 'maple', 'markdown',
            'mathematica', 'matlab', 'ocaml', 'pascal', 'perl', 'php',
            'powershell', 'prolog', 'protocol-buffer', 'python', 'r', 'racket',
            'restructuredtext', 'rmarkdown', 'ruby', 'rust', 'sas', 'scala',
            'scheme', 'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
            'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
            'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
            'yacc', 'zig',
        ]
    )

    for d in dataset:
        for row in d:
            yield row['content']

    del dataset
    gc.collect()

    # text + code
    dataset = load_dataset('m-a-p/CodeFeedback-Filtered-Instruction', split='train')

    for row in dataset:
        yield row['query'] + '\n' + row['answer']

    del dataset
    gc.collect()

    # math
    dataset = load_dataset('gair-prox/open-web-math-pro', split='train')

    for row in dataset:
        yield row['text']

    del dataset
    gc.collect()

    # math
    dataset = load_dataset('ajibawa-2023/Maths-College', split='train')

    for row in dataset:
        yield row['instruction'] + '\n' + row['output']

    del dataset
    gc.collect()
    # math
    dataset = load_dataset('microsoft/orca-math-word-problems-200k', split='train')

    for row in dataset:
        yield row['question'] + '\n' + row['answer']

    del dataset
    gc.collect()

    # emoji
    dataset = load_dataset('badrex/llm-emoji-dataset', split='train')

    for row in dataset:
        yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'

    del dataset
    gc.collect()


bpe = BPE(unk_token=None, fuse_unk=False, byte_fallback=False, ignore_merges=True)
tokenizer = Tokenizer(bpe)

special_tokens = [
    '', '', '',
    '<|im_start|>', '<|im_end|>',
    'system', 'user', 'assistant', 'resource', 'tool', 'agent',

    # tool/function calling
    '', '', '', '', '', '',
    '"arguments"', '"name"',
    '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '', '',

    # qa
    '', '', '', '', '', '',

    # cot, tot
    '', '', '', '', '', '', '', '', '', '', '', '',
    '', '', '', '', '', '', '', '', '', '', '', '',

    # react
    '', '', '', '', '', '', '', '', '', '', '', '',

    # reflection
    '', '', '', '', '', '', '', '',
]

# runs of 2..24 spaces as dedicated tokens
for i in range(2, 25):
    special_tokens.append(' ' * i)

# pad the special-token block out to 128 entries
for i in range(128 - len(special_tokens)):
    special_tokens.append(f'<|reserved_{i}|>')

# emoji
dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
emoji_chars = [row['character'] for row in dataset if len(row['character']) == 1]
del dataset

# programming languages
dataset = load_dataset('Tanvir1337/programming-languages', split='train')
programming_languages = [n for row in dataset for n in row['text']]
del dataset

# programming languages keywords
dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
code_keywords = [n for row in dataset for n in row['keywords']]
del dataset

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)

tokenizer.post_processor = TemplateProcessing(
    single='$A:0',  # $A is the sequence; :0 is the type ID used for single sequences
    pair='$A:0 $B:1',  # for pairs, the second sequence gets type ID 1
    special_tokens=[],
)

tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)

trainer = BpeTrainer(
    vocab_size=38400,  # 32768 chars + 5034 emojis
    min_frequency=2,
    special_tokens=special_tokens,
    initial_alphabet=emoji_chars + programming_languages + code_keywords,
)

tokenizer.train_from_iterator(batch_iterator(), trainer)
tokenizer.save('../tokenizer.json')
tokenizer.model.save('../')

CHATML_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|im_start|>assistant\n' }}"
    "{% endif %}"
)

fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    chat_template=CHATML_CHAT_TEMPLATE,
    bos_token='',
    eos_token='',
    unk_token='',
    pad_token='',
    clean_up_tokenization_spaces=False,
)

fast_tokenizer.save_pretrained('../')
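
# Optional sanity check (a minimal sketch, assuming the artifacts above were just written
# to '../'): reload the saved tokenizer, render the ChatML template for a tiny made-up
# conversation, and round-trip an illustrative sample string. The messages and the sample
# text below are placeholders, not part of the training data.
loaded = PreTrainedTokenizerFast.from_pretrained('../')

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Write a haiku about tokenizers.'},
]

# Renders '<|im_start|>system\n...<|im_end|>\n...' and ends with the assistant prompt.
print(loaded.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

# Byte-level BPE with no normalizer should reproduce ordinary text exactly on decode.
sample = 'def add(a, b):\n    return a + b  # 123 + 456'
ids = loaded.encode(sample)
print(len(ids), loaded.decode(ids) == sample)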