initial version

Browse files

Files changed (13) hide show

config.json +22 -0
merges.txt +0 -0
misc/logo.png +3 -0
scripts/TRAIN.md +64 -0
scripts/prepare_contrain_dataset.py +31 -0
scripts/prepare_pretrain_dataset.py +147 -0
scripts/pretrain-model.yaml +150 -0
scripts/requirements.in +12 -0
scripts/train_tokenizer.py +337 -0
special_tokens_map.json +6 -0
tokenizer.json +0 -0
tokenizer_config.json +1052 -0
vocab.json +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "_name_or_path": "tangledgroup/tangled-llama-33m-32k-base-v0.1",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_size": 1024,
+  "intermediate_size": 4096,
+  "max_position_embeddings": 38400,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 5,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.2",
+  "use_cache": true,
+  "vocab_size": 38400
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

misc/logo.png ADDED Viewed

Git LFS Details

SHA256: 6db9ef3985f85ad849bd3a1051e9dadb7703dfa1fefbe50f53e0d935e10ab27b
Pointer size: 132 Bytes
Size of remote file: 1.8 MB

scripts/TRAIN.md ADDED Viewed

	@@ -0,0 +1,64 @@

+# Train
+## Environment
+```bash
+cd scripts
+python -m venv venv
+source venv/bin/activate
+pip install -U -r requirements.in
+```
+## Tokenizer
+```bash
+python -B train_tokenizer.py
+```
+## Dataset
+```bash
+python -B prepare_pretrain_dataset.py
+```
+## Model
+### Pretrain
+```bash
+litgpt pretrain --config ./pretrain-model.yaml
+```
+```bash
+litgpt convert_from_litgpt out/pretrain/final/ out/converted_model
+cp config.json out/pretrain/final/
+cp config.json out/converted_model/
+```
+```python
+import torch
+from safetensors.torch import save_file
+state_dict = torch.load('out/converted_model/model.pth', map_location='cpu')
+save_file(state_dict, 'out/converted_model/model.safetensors')
+```
+## Evaluate
+```bash
+litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-quick/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-leaderboard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot' --out_dir 'evaluate-bigbenchhard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'mmlu_multilingual,mgsm' --out_dir 'evaluate-multilinguals/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+litgpt evaluate --tasks 'qasper' --out_dir 'evaluate-long/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
+```

scripts/prepare_contrain_dataset.py ADDED Viewed

	@@ -0,0 +1,31 @@

+"""
+# https://huggingface.co/datasets/Tongjilibo/self_cognition
+https://huggingface.co/datasets/HuggingFaceH4/no_robots
+https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
+https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1
+https://huggingface.co/datasets/Locutusque/function-calling-chatml
+https://huggingface.co/datasets/cognitivecomputations/SystemChat-2.0
+https://huggingface.co/datasets/teknium/OpenHermes-2.5
+https://huggingface.co/datasets/cognitivecomputations/open-instruct-uncensored
+https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_V2_196k
+https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft
+https://huggingface.co/datasets/Open-Orca/slimorca-deduped-cleaned-corrected
+https://huggingface.co/datasets/Undi95/andrijdavid_roleplay-conversation-sharegpt
+https://huggingface.co/datasets/roleplay4fun/CoupleRP
+https://huggingface.co/datasets/arcee-ai/EvolKit-20k
+https://huggingface.co/datasets/arcee-ai/The-Tome
+https://huggingface.co/datasets/arcee-ai/agent-data
+https://huggingface.co/datasets/arcee-ai/reasoning-sharegpt
+https://huggingface.co/datasets/arcee-ai/infini-instruct-top-500k
+https://huggingface.co/datasets/arcee-ai/BAAI-Infinity-Instruct-System
+# https://huggingface.co/datasets/arcee-ai/financial-instructions-cleaned-2
+https://huggingface.co/datasets/KingNish/reasoning-base-20k
+https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
+https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
+https://huggingface.co/datasets/thesven/gsm8k-reasoning
+"""

scripts/prepare_pretrain_dataset.py ADDED Viewed

	@@ -0,0 +1,147 @@

+from typing import Optional
+from functools import partial
+from datasets import load_dataset
+from litdata import optimize, TokensLoader
+from litgpt.tokenizer import Tokenizer
+def batch_iterator(path: str,
+                   name: Optional[str]=None,
+                   data_dir: Optional[str]=None,
+                   data_files: Optional[str]=None,
+                   revision: Optional[str]=None,
+                   split: str='train',
+                   format: Optional[str]=None):
+    assert format is not None
+    dataset = load_dataset(path=path,
+                           name=name,
+                           data_dir=data_dir,
+                           data_files=data_files,
+                           revision=revision,
+                           split=split,
+                           trust_remote_code=True)
+    for row in dataset:
+        text = format.format(**row)
+        yield text
+def tokenize_fn(datasets_config, tokenizer=None):
+    for text in batch_iterator(**datasets_config):
+        text_ids = tokenizer.encode(text, bos=False, eos=True)
+        yield text_ids
+datasets_configs = [
+    # instruct
+    {'path': 'yahma/alpaca-cleaned', 'format': '{instruction} {input} {output}'},
+    {'path': 'gbharti/wealth-alpaca_lora', 'format': '{instruction} {input} {output}'},
+    {'path': 'databricks/databricks-dolly-15k', 'format': '{instruction} {context} {response}'},
+    {'path': 'VMware/open-instruct', 'format': '{instruction} {response}'},
+    # multilingual
+    *[
+        {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train[:10%]', 'format': '{instruction} {input} {output}'}
+        for data_dir in [
+            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+        ]
+    ],
+    *[
+        {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train[:10%]', 'format': '{text}'}
+        for name in [
+            'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+            'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+            'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+            'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+            'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+            'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+            'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+            'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+            'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+            'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+            'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+            'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+            'zh-Hans', 'zh-Hant', 'zu',
+        ]
+    ],
+    # *[
+    #     {'path': 'Salesforce/wikitext', 'name': name, 'split': 'train+validation+test', 'format': '{text}'}
+    #     for name in [
+    #         'wikitext-103-raw-v1',
+    #         'wikitext-103-v1',
+    #         'wikitext-2-raw-v1',
+    #         'wikitext-2-v1',
+    #     ]
+    # ],
+    {'path': 'jordiclive/wikipedia-summary-dataset', 'format': '{summary}'},
+    # {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': 'train[:5%]', 'format': '{text}'},
+    # general
+    # {'path': 'MuskumPillerum/General-Knowledge', 'format': '{Question} {Answer}'},
+    # {'path': 'yirenc/general_knowledge_boolean', 'split': 'train+validation', 'format': '{question}? {answer}. {passage}'},
+    # {'path': 'nuvocare/MSD_instruct', 'split': 'train+test', 'format': '{Question} {Text}'},
+    # {'path': 'keivalya/MedQuad-MedicalQnADataset', 'split': 'train', 'format': '{Question} {Answer}'},
+    # {'path': 'NousResearch/CharacterCodex', 'split': 'train', 'format': '{scenario} {description}'},
+    # {'path': 'nampdn-ai/tiny-textbooks', 'split': 'train+test', 'format': '{textbook}'},
+    # code
+    {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
+    *[
+        {'path': 'bigcode/the-stack-smol-xs', 'name': name, 'format': '{content}'}
+        for name in [
+            'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
+            'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
+            'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
+            'css', 'cuda', 'dart', 'dockerfile', 'elixir',
+            'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
+            'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
+            'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
+            'literate-agda', 'literate-coffeescript', 'literate-haskell',
+            'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
+            'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
+            'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
+            'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
+            'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
+            'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
+            'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
+            'yacc', 'zig',
+        ]
+    ],
+    {'path': 'm-a-p/CodeFeedback-Filtered-Instruction', 'split': 'train', 'format': '{query} {answer}'},
+    {'path': 'jtatman/python-code-dataset-500k', 'format': '{instruction} {output}'},
+    {'path': 'iamtarun/python_code_instructions_18k_alpaca', 'format': '{instruction} {input} {output}'},
+    {'path': 'HuggingFaceH4/CodeAlpaca_20K', 'split': 'train+test', 'format': '{prompt} {completion}'},
+    {'path': 'cognitivecomputations/dolphin-coder', 'split': 'train', 'format': '{question} {response}'},
+    # math
+    {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
+    {'path': 'gair-prox/open-web-math-pro', 'split': 'train[:5%]', 'format': '{text}'},
+    {'path': 'rvv-karma/Math-QA', 'split': 'train+val+test', 'format': '{question} {answer}'},
+    {'path': 'ajibawa-2023/Maths-College', 'split': 'train', 'format': '{instruction} {output}'},
+    {'path': 'microsoft/orca-math-word-problems-200k', 'format': '{question} {answer}'},
+    {'path': 'meta-math/MetaMathQA', 'format': '{query} {response}'},
+    {'path': 'TIGER-Lab/MathInstruct', 'format': '{instruction} {output}'},
+    {'path': 'TIGER-Lab/WebInstructSub', 'format': '{question} {answer}'},
+    # reasoning
+    {'path': 'SkunkworksAI/reasoning-0.01', 'format': '{instruction} {reasoning} {output}'},
+    {'path': 'KingNish/reasoning-base-20k', 'format': '{user} {reasoning} {assistant}'},
+    {'path': 'Magpie-Align/Magpie-Reasoning-150K', 'format': '{instruction} {response}'},
+    {'path': 'thesven/gsm8k-reasoning', 'format': '{question} {generation} {answer} {short_answer}'},
+    {'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track', 'format': '{informal_statement} {informal_proof} {formal_proof}'},
+    # misc
+    {'path': 'badrex/llm-emoji-dataset', 'format': '{character} {unicode} {short description} {tags} {LLM description}'},
+]
+# Tokenize every entry of `datasets_configs` with the tokenizer loaded from the
+# parent directory ('..') and write the optimized chunks to '../pretrain-data/'.
+# NOTE(review): `TokensLoader` is imported at the top of this script but never
+# passed to `optimize` (e.g. as an item loader) — confirm that is intended.
+outputs = optimize(
+    fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
+    inputs=datasets_configs,
+    output_dir='../pretrain-data/',
+    # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
+    chunk_size=(2049 * 8012),
+    num_workers=32,
+)

scripts/pretrain-model.yaml ADDED Viewed

	@@ -0,0 +1,150 @@

+# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
+# ``model_config``. (type: Optional[str], default: null)
+model_name: "Llama-3.2-1B"
+# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
+# ``model_name``. (type: Optional[Config], default: null)
+model_config:
+  padded_vocab_size: 38400
+  vocab_size: 38400
+  block_size: 8192
+  n_layer: 5
+  n_head: 32
+  head_size: null
+  n_embd: 1024
+  n_query_groups: 8
+  rotary_percentage: 1.0
+  parallel_residual: false
+  bias: false
+  norm_class_name: "RMSNorm"
+  norm_eps: 1e-05
+  mlp_class_name: "LLaMAMLP"
+  intermediate_size: 4096
+  rope_base: 500000
+  # rope_adjustments:
+  #   factor: 32.0
+  #   low_freq_factor: 1.0
+  #   high_freq_factor: 4.0
+  #   original_max_seq_len: 8192
+# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
+# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
+out_dir: "../out/pretrain/"
+# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+# precision: bf16-mixed
+precision: bf16-true
+# Optional path to a checkpoint directory to initialize the model from.
+# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
+initial_checkpoint_dir:
+# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
+# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
+# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
+# (type: Union[bool, Literal["auto"], Path], default: False)
+# resume: false
+resume: "auto"
+# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
+data:
+  class_path: LitData
+  init_args:
+    data_path: "../pretrain-data/"
+    num_workers: 16
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 200
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
+  global_batch_size: 512
+  # Number of samples per data-parallel rank (type: int, default: 4)
+  # micro_batch_size: 16
+  micro_batch_size: 4
+  # Number of iterations with learning rate warmup active (type: int, default: 2000)
+  lr_warmup_steps: 2000
+  # Number of epochs to train on (type: Optional[int], default: null)
+  epochs:
+  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
+  # max_tokens: 3000000000000
+  # max_tokens: 8159107755 # 796399 * 2049 * 5
+  max_tokens: 11422750857 # 796399 * 2049 * 7
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length:
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
+  tie_embeddings:
+  #   (type: Optional[float], default: 1.0)
+  max_norm: 1.0
+  #   (type: float, default: 4e-05)
+  min_lr: 1e-4
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
+  interval: 100
+  # Number of tokens to generate (type: Optional[int], default: null)
+  max_new_tokens:
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+  # Whether to evaluate on the validation set at the end of the training
+  final_validation: true
+# Optimizer-related arguments
+optimizer:
+  # class_path: torch.optim.AdamW
+  class_path: grokadamw.GrokAdamW
+  # class_path: bitsandbytes.optim.AdamW8bit
+  # class_path: bitsandbytes.optim.PagedAdamW8bit
+  init_args:
+    #   (type: float, default: 0.001)
+    # lr: 1e-3
+    lr: 1e-4
+    #   (type: float, default: 0.01)
+    # weight_decay: 0.01
+    weight_decay: 0.1
+    #   (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
+# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
+devices: auto
+# How many nodes to use. (type: int, default: 1)
+num_nodes: 1
+# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
+# module require this. (type: Optional[Path], default: null)
+tokenizer_dir: "../"
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
+logger_name: "wandb"
+# The random seed to use for reproducibility. (type: int, default: 42)
+seed: 42

scripts/requirements.in ADDED Viewed

	@@ -0,0 +1,12 @@

+# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+tqdm
+datasets
+jinja2
+transformers
+wandb
+# litgpt[all]
+litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
+litdata
+grokadamw
+# bitsandbytes

scripts/train_tokenizer.py ADDED Viewed

	@@ -0,0 +1,337 @@

+import gc
+import sys
+from datasets import load_dataset
+from transformers import PreTrainedTokenizerFast
+from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors, decoders
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.processors import TemplateProcessing
+# Interactive guard: exit right away unless the user explicitly confirms,
+# because the rest of this script retrains the tokenizer and writes its
+# output files into '../'.
+x = input('Are you sure? [y/N] ')
+if x not in ('y', 'Y', 'yes'):
+    sys.exit(0)
+def batch_iterator():
+    # text
+    dataset = (
+        load_dataset('saillab/taco-datasets', data_dir=data_dir, split='train')
+        for data_dir in [
+            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+        ]
+    )
+    for d in dataset:
+        for row in d:
+            for n in row:
+                yield row['instruction'] + '\n' + row['input'] + '\n' + row['output']
+    del dataset
+    gc.collect()
+    # text
+    dataset = (
+        load_dataset('xu-song/cc100-samples', lang, split='train')
+        for lang in [
+            'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+            'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+            'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+            'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+            'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+            'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+            'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+            'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+            'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+            'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+            'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+            'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+            'zh-Hans', 'zh-Hant', 'zu',
+        ]
+    )
+    for d in dataset:
+        for row in d['text']:
+            yield row
+    del dataset
+    gc.collect()
+    # code
+    dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
+    for row in dataset:
+        for n in row['keywords']:
+            yield n
+    del dataset
+    gc.collect()
+    # code
+    dataset = (
+        load_dataset('bigcode/the-stack-smol-xs', lang, split='train', trust_remote_code=True)
+        for lang in [
+            'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
+            'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
+            'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
+            'css', 'cuda', 'dart', 'dockerfile', 'elixir',
+            'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
+            'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
+            'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
+            'literate-agda', 'literate-coffeescript', 'literate-haskell',
+            'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
+            'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
+            'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
+            'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
+            'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
+            'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
+            'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
+            'yacc', 'zig',
+        ]
+    )
+    for d in dataset:
+        for row in d:
+            yield row['content']
+    del dataset
+    gc.collect()
+    # text + code
+    dataset = load_dataset('m-a-p/CodeFeedback-Filtered-Instruction', split='train')
+    for row in dataset:
+        yield row['query'] + '\n' + row['answer']
+    del dataset
+    gc.collect()
+    # math
+    dataset = load_dataset('gair-prox/open-web-math-pro', split='train')
+    for row in dataset:
+        yield row['text']
+    del dataset
+    gc.collect()
+    # math
+    dataset = load_dataset('ajibawa-2023/Maths-College', split='train')
+    for row in dataset:
+        yield row['instruction'] + '\n' + row['output']
+    del dataset
+    gc.collect()
+    # math
+    dataset = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
+    for row in dataset:
+        yield row['question'] + '\n' + row['answer']
+    del dataset
+    gc.collect()
+    # emoji
+    dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
+    for row in dataset:
+        yield f'{row["character"]}\n{row["unicode"]}\n{row["short description"]}\n{row["tags"]}\n{row["LLM description"]}'
+    del dataset
+    gc.collect()
+# BPE model configured with no unknown token and with byte fallback disabled.
+bpe = BPE(unk_token=None, fuse_unk=False, byte_fallback=False, ignore_merges=True)
+tokenizer = Tokenizer(bpe)
+special_tokens = [
+    '<unk>',
+    '<s>',
+    '</s>',
+    '<|im_start|>',
+    '<|im_end|>',
+    'system',
+    'user',
+    'assistant',
+    'resource',
+    'tool',
+    'agent',
+    # tool/function calling
+    '<tools>',
+    '</tools>',
+    '<tool_call>',
+    '</tool_call>',
+    '<tool_response>',
+    '</tool_response>',
+    '"arguments"',
+    '"name"',
+    '<arguments>',
+    '</arguments>',
+    '<argument>',
+    '</argument>',
+    '<argument-name>',
+    '</argument-name>',
+    '<argument-type>',
+    '</argument-type>',
+    '<argument-value>',
+    '</argument-value>',
+    '<parameter>',
+    '</parameter>',
+    '<parameter-name>',
+    '</parameter-name>',
+    '<parameter-type>',
+    '</parameter-type>',
+    '<parameter-value>',
+    '</parameter-value>',
+    '<field>',
+    '</field>',
+    '<field-name>',
+    '</field-name>',
+    '<field-type>',
+    '</field-type>',
+    '<field-value>',
+    '</field-value>',
+    '<name>',
+    '</name>',
+    '<type>',
+    '</type>',
+    '<value>',
+    '</value>',
+    '<function>',
+    '</function>',
+    '<function-name>',
+    '</function-name>',
+    '<function-type>',
+    '</function-type>',
+    '<function-value>',
+    '</function-value>',
+    # qa
+    '<qa>',
+    '</qa>',
+    '<question>',
+    '</question>',
+    '<answer>',
+    '</answer>',
+    # cot, tot
+    '<cot>',
+    '</cot>',
+    '<tot>',
+    '</tot>',
+    '<input>',
+    '</input>',
+    '<output>',
+    '</output>',
+    '<thoughts>',
+    '</thoughts>',
+    '<thought>',
+    '</thought>',
+    '<plans>',
+    '</plans>',
+    '<plan>',
+    '</plan>',
+    '<votes>',
+    '</votes>',
+    '<vote>',
+    '</vote>',
+    '<passages>',
+    '</passages>',
+    '<passage>',
+    '</passage>',
+    # react
+    '<react>',
+    '</react>',
+    '<reasoning>',
+    '</reasoning>',
+    '<acting>',
+    '</acting>',
+    '<action>',
+    '</action>',
+    '<observation>',
+    '</observation>',
+    '<claim>',
+    '</claim>',
+    # reflection
+    '<thinking>',
+    '</thinking>',
+    '<step>',
+    '</step>',
+    '<reflection>',
+    '</reflection>',
+    '<output>',
+    '</output>',
+]
+for i in range(2, 25):
+    special_tokens.append(' ' * i)
+for i in range(128 - len(special_tokens)):
+    special_tokens.append(f'<|reserved_{i}|>')
+# emoji
+# Keep only entries whose `character` is a single code point (len == 1).
+dataset = load_dataset('badrex/llm-emoji-dataset', split='train')
+emoji_chars = [row['character'] for row in dataset if len(row['character']) == 1]
+del dataset
+# programming languages
+# NOTE(review): if row['text'] is a plain string, the inner loop iterates its
+# individual characters rather than whole names — confirm the column type.
+dataset = load_dataset('Tanvir1337/programming-languages', split='train')
+programming_languages = [n for row in dataset for n in row['text']]
+del dataset
+# programming languages keywords
+# Flattens every row's `keywords` collection into one list.
+dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
+code_keywords = [n for row in dataset for n in row['keywords']]
+del dataset
+# Byte-level pre-tokenization and decoding, without an automatic prefix space.
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
+# The post-processor only assigns type ids; no special tokens are inserted.
+tokenizer.post_processor = TemplateProcessing(
+    single='$A:0',                              # $A represents the token, :0 specifies the type ID for single sequences
+    pair='$A:0 $B:1',                           # For pairs, we specify type IDs for both tokens
+    special_tokens=[],
+)
+tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
+# NOTE(review): the vocab_size comment below does not add up
+# (32768 + 5034 = 37802, not 38400) — confirm the intended breakdown.
+# NOTE(review): per the tokenizers documentation only the first character of
+# each `initial_alphabet` string is kept, so multi-character language names and
+# keywords would contribute just their first character — confirm intent.
+trainer = BpeTrainer(
+    vocab_size=38400, # 32768 chars + 5034 emojis
+    min_frequency=2,
+    special_tokens=special_tokens,
+    initial_alphabet=emoji_chars + programming_languages + code_keywords,
+)
+# Train on the texts streamed by batch_iterator(), then persist both the full
+# tokenizer definition and the bare BPE model files into the parent directory.
+tokenizer.train_from_iterator(batch_iterator(), trainer)
+tokenizer.save('../tokenizer.json')
+tokenizer.model.save('../')
+# Jinja chat template: each message is rendered as
+# '<|im_start|>' + role + newline + content + '<|im_end|>' + newline, and an
+# opening '<|im_start|>assistant' line is appended when a generation prompt
+# is requested.
+CHATML_CHAT_TEMPLATE = (
+    "{% for message in messages %}"
+        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
+    "{% endfor %}"
+    "{% if add_generation_prompt %}"
+        "{{ '<|im_start|>assistant\n' }}"
+    "{% endif %}"
+)
+# Wrap the trained tokenizer for use with transformers. The padding token
+# reuses the end-of-sequence token '</s>'.
+fast_tokenizer = PreTrainedTokenizerFast(
+    tokenizer_object=tokenizer,
+    chat_template=CHATML_CHAT_TEMPLATE,
+    bos_token='<s>',
+    eos_token='</s>',
+    unk_token='<unk>',
+    pad_token='</s>',
+    clean_up_tokenization_spaces=False,
+)
+# Write the transformers-format tokenizer files into the parent directory.
+fast_tokenizer.save_pretrained('../')

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "</s>",
+  "unk_token": "<unk>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,1052 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "system",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "user",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "assistant",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "resource",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "tool",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "agent",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<tools>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "</tools>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "\"arguments\"",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "\"name\"",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "19": {
+      "content": "<arguments>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "20": {
+      "content": "</arguments>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "21": {
+      "content": "<argument>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "22": {
+      "content": "</argument>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "23": {
+      "content": "<argument-name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "24": {
+      "content": "</argument-name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "25": {
+      "content": "<argument-type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "26": {
+      "content": "</argument-type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "27": {
+      "content": "<argument-value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "28": {
+      "content": "</argument-value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "29": {
+      "content": "<parameter>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "30": {
+      "content": "</parameter>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "31": {
+      "content": "<parameter-name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32": {
+      "content": "</parameter-name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "33": {
+      "content": "<parameter-type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "34": {
+      "content": "</parameter-type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "35": {
+      "content": "<parameter-value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "36": {
+      "content": "</parameter-value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "37": {
+      "content": "<field>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "38": {
+      "content": "</field>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "39": {
+      "content": "<field-name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "40": {
+      "content": "</field-name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "41": {
+      "content": "<field-type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "42": {
+      "content": "</field-type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "43": {
+      "content": "<field-value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "44": {
+      "content": "</field-value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "45": {
+      "content": "<name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46": {
+      "content": "</name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "47": {
+      "content": "<type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "48": {
+      "content": "</type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49": {
+      "content": "<value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50": {
+      "content": "</value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "51": {
+      "content": "<function>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "52": {
+      "content": "</function>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "53": {
+      "content": "<function-name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "54": {
+      "content": "</function-name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "55": {
+      "content": "<function-type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "56": {
+      "content": "</function-type>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57": {
+      "content": "<function-value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "58": {
+      "content": "</function-value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59": {
+      "content": "<qa>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "60": {
+      "content": "</qa>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "61": {
+      "content": "<question>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "62": {
+      "content": "</question>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "63": {
+      "content": "<answer>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "64": {
+      "content": "</answer>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "65": {
+      "content": "<cot>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "66": {
+      "content": "</cot>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "67": {
+      "content": "<tot>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "68": {
+      "content": "</tot>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "69": {
+      "content": "<input>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70": {
+      "content": "</input>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "71": {
+      "content": "<output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "72": {
+      "content": "</output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73": {
+      "content": "<thoughts>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "74": {
+      "content": "</thoughts>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "75": {
+      "content": "<thought>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "76": {
+      "content": "</thought>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "77": {
+      "content": "<plans>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "78": {
+      "content": "</plans>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "79": {
+      "content": "<plan>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "80": {
+      "content": "</plan>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "81": {
+      "content": "<votes>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "82": {
+      "content": "</votes>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "83": {
+      "content": "<vote>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "84": {
+      "content": "</vote>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "85": {
+      "content": "<passages>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "86": {
+      "content": "</passages>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "87": {
+      "content": "<passage>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "88": {
+      "content": "</passage>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "89": {
+      "content": "<react>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "90": {
+      "content": "</react>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "91": {
+      "content": "<reasoning>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92": {
+      "content": "</reasoning>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "93": {
+      "content": "<acting>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "94": {
+      "content": "</acting>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "95": {
+      "content": "<action>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96": {
+      "content": "</action>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "97": {
+      "content": "<observation>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "98": {
+      "content": "</observation>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "99": {
+      "content": "<claim>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "</claim>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "<thinking>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "</thinking>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "<step>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "104": {
+      "content": "</step>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "105": {
+      "content": "<reflection>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "106": {
+      "content": "</reflection>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "107": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "108": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "109": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "110": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "111": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "112": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "113": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "114": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "115": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "116": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "117": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "118": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "119": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "120": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "121": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "122": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "123": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "124": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "125": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "126": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "127": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "129": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff