diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dca02710cc1109b7019ef80e5ca6d385733c7d31 --- /dev/null +++ b/README.md @@ -0,0 +1,152 @@ +--- +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +library_name: peft +license: apache-2.0 +tags: +- generated_from_trainer +model-index: +- name: outputs/lora-out + results: [] +--- + + + +[Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) +
See axolotl config + +axolotl version: `0.4.1` +```yaml +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: +val_set_size: 0.05 +output_dir: ./outputs/lora-out + +sequence_len: 4096 +sample_packing: true +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +evals_per_epoch: 4 +saves_per_epoch: 1 +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + +``` + +
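+The effective global batch size follows from micro_batch_size × gradient_accumulation_steps × number of GPUs. A minimal sketch of that arithmetic, assuming the single-GPU run implied by the total_train_batch_size of 8 reported below:
+
+```python
+# Sketch of the effective global batch size implied by the config above
+# (illustration only, not Axolotl code).
+micro_batch_size = 2             # per-device batch size, from the config
+gradient_accumulation_steps = 4  # from the config
+world_size = 1                   # assumed number of GPUs
+
+effective_batch_size = micro_batch_size * gradient_accumulation_steps * world_size
+print(effective_batch_size)      # -> 8, matching total_train_batch_size below
+```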
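+A minimal inference sketch for the finished adapter, assuming transformers and peft at the versions listed under "Framework versions" below and that the adapter files sit in `outputs/lora-out`; the Alpaca-style prompt mirrors the `type: alpaca` dataset setting, and the instruction text is only an example:
+
+```python
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+base_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+tokenizer = AutoTokenizer.from_pretrained(base_id)
+base = AutoModelForCausalLM.from_pretrained(
+    base_id, torch_dtype=torch.float16, device_map="auto"
+)
+
+# Attach the LoRA adapter (reads adapter_config.json and the adapter weights).
+model = PeftModel.from_pretrained(base, "outputs/lora-out")
+
+# Alpaca-style prompt, matching the fine-tuning data format.
+prompt = (
+    "Below is an instruction that describes a task. "
+    "Write a response that appropriately completes the request.\n\n"
+    "### Instruction:\nName three primary colors.\n\n### Response:\n"
+)
+inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+output = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+If a standalone checkpoint is needed, `model.merge_and_unload()` can fold the LoRA weights back into the base model before saving.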

+
+# outputs/lora-out
+
+This model is a LoRA fine-tune of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the [mhenrichsen/alpaca_2k_test](https://huggingface.co/datasets/mhenrichsen/alpaca_2k_test) dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.2122
+
+## Model description
+
+A LoRA adapter (r=32, alpha=16, dropout 0.05, targeting all linear projection layers) trained with Axolotl on top of TinyLlama-1.1B; see the Axolotl config above for the full setup.
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+The adapter was trained on [mhenrichsen/alpaca_2k_test](https://huggingface.co/datasets/mhenrichsen/alpaca_2k_test), a 2k-sample Alpaca-format split, with 5% held out as the evaluation set (val_set_size: 0.05).
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: 8-bit AdamW (adamw_bnb_8bit) with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 4
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.4615 | 0.08 | 1 | 1.4899 |
+| 1.3849 | 0.24 | 3 | 1.4852 |
+| 1.3665 | 0.48 | 6 | 1.4411 |
+| 1.2689 | 0.72 | 9 | 1.3381 |
+| 1.2258 | 0.96 | 12 | 1.2960 |
+| 1.2518 | 1.16 | 15 | 1.2797 |
+| 1.2263 | 1.4 | 18 | 1.2534 |
+| 1.1343 | 1.64 | 21 | 1.2354 |
+| 1.2699 | 1.88 | 24 | 1.2255 |
+| 1.1493 | 2.08 | 27 | 1.2228 |
+| 1.153 | 2.32 | 30 | 1.2188 |
+| 1.1947 | 2.56 | 33 | 1.2183 |
+| 1.1125 | 2.8 | 36 | 1.2157 |
+| 1.1512 | 3.04 | 39 | 1.2123 |
+| 1.1883 | 3.24 | 42 | 1.2100 |
+| 1.1012 | 3.48 | 45 | 1.2119 |
+| 1.1891 | 3.72 | 48 | 1.2122 |
+
+
+### Framework versions
+
+- PEFT 0.11.1
+- Transformers 4.41.1
+- Pytorch 2.1.2+cu118
+- Datasets 2.19.1
+- Tokenizers 0.19.1
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c2f35e8ab83cadcf66049c27173cb1262d730192
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj",
+ "down_proj",
+ "gate_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.bin b/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..03f5ccc91c5182bcaa1c35eca4474051f9f72a5d
--- /dev/null
+++ b/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83fb3b79b5ba0461f172a5ba42b454e475a89f340302d0e7290c06fafb74bb5e
+size 101036698
diff --git a/checkpoint-12/README.md b/checkpoint-12/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..136e7e248b46366ae875395c8633d12be7b75f80
--- /dev/null
+++ b/checkpoint-12/README.md
@@ -0,0 +1,202 @@
+---
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:**
[More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-12/adapter_config.json b/checkpoint-12/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c2f35e8ab83cadcf66049c27173cb1262d730192 --- /dev/null +++ b/checkpoint-12/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "k_proj", + "up_proj", + "down_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-12/adapter_model.safetensors b/checkpoint-12/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4fc33ac546eb0a885997d131675a13b1313d217d --- /dev/null +++ b/checkpoint-12/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e78dfab986ee65ea34621a7fcd0838be219cf44ea2ac5d5125cbb1dc459193 +size 100966336 diff --git a/checkpoint-12/optimizer.pt b/checkpoint-12/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b95f215697ac6fb2c903cee8e254ea86ef51cad4 --- /dev/null +++ b/checkpoint-12/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:344a969815604ff51a5d4ced4dd10e3d5b4b6cadb4f6ec63d2d38fcd9ea29224 +size 50916644 diff --git a/checkpoint-12/rng_state.pth b/checkpoint-12/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b44440988c2b2fe773e93b385f27b4ad4f2213b8 --- /dev/null +++ b/checkpoint-12/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1caded59d882f62e038c05e37353b49af7a26d65cdab24f3ce036c72f8a46a4 +size 14244 diff --git a/checkpoint-12/scheduler.pt b/checkpoint-12/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..48748cf6e74350ca0094e19264c545ccf14b4f53 --- /dev/null +++ b/checkpoint-12/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e535a15f440e3b1b4d1872998a3c1d64048b2d54e365eb59e3aa3a5899e46b5 +size 1064 diff --git 
a/checkpoint-12/special_tokens_map.json b/checkpoint-12/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-12/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-12/tokenizer.model b/checkpoint-12/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-12/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-12/tokenizer_config.json b/checkpoint-12/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86
--- /dev/null
+++ b/checkpoint-12/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-12/trainer_state.json b/checkpoint-12/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a98f6113ad822198a96ce9809cabdc43d73b5d10
--- /dev/null
+++ b/checkpoint-12/trainer_state.json
@@ -0,0 +1,157 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.96,
+ "eval_steps": 3,
+ "global_step": 12,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.1656520962715149,
+ "learning_rate": 2e-05,
+ "loss": 1.4615,
+ "step": 1
+ },
+ {
+ "epoch": 0.08,
+ "eval_loss": 1.4899382591247559,
+ "eval_runtime": 29.186,
+ "eval_samples_per_second": 3.426,
+ "eval_steps_per_second": 1.713,
+ "step": 1
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.1882542371749878,
+ "learning_rate": 4e-05,
+ "loss": 1.4241,
+ "step": 2
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.15945176780223846,
+ "learning_rate": 6e-05,
+ "loss": 1.3849,
+ "step": 3
+ },
+ {
+ "epoch": 0.24,
+ "eval_loss": 1.485183835029602,
+ "eval_runtime": 29.3484,
+ "eval_samples_per_second": 3.407,
+ "eval_steps_per_second": 1.704,
+ "step": 3
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.13675835728645325,
+ "learning_rate": 8e-05,
+ "loss": 1.2212,
+ "step": 4
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm":
0.1532098948955536, + "learning_rate": 0.0001, + "loss": 1.3626, + "step": 5 + }, + { + "epoch": 0.48, + "grad_norm": 0.16159594058990479, + "learning_rate": 0.00012, + "loss": 1.3665, + "step": 6 + }, + { + "epoch": 0.48, + "eval_loss": 1.441084623336792, + "eval_runtime": 29.4785, + "eval_samples_per_second": 3.392, + "eval_steps_per_second": 1.696, + "step": 6 + }, + { + "epoch": 0.56, + "grad_norm": 0.1462002545595169, + "learning_rate": 0.00014, + "loss": 1.3003, + "step": 7 + }, + { + "epoch": 0.64, + "grad_norm": 0.13418763875961304, + "learning_rate": 0.00016, + "loss": 1.3331, + "step": 8 + }, + { + "epoch": 0.72, + "grad_norm": 0.10984567552804947, + "learning_rate": 0.00018, + "loss": 1.2689, + "step": 9 + }, + { + "epoch": 0.72, + "eval_loss": 1.3380621671676636, + "eval_runtime": 29.5305, + "eval_samples_per_second": 3.386, + "eval_steps_per_second": 1.693, + "step": 9 + }, + { + "epoch": 0.8, + "grad_norm": 0.10075916349887848, + "learning_rate": 0.0002, + "loss": 1.2942, + "step": 10 + }, + { + "epoch": 0.88, + "grad_norm": 0.11774784326553345, + "learning_rate": 0.000199658449300667, + "loss": 1.2936, + "step": 11 + }, + { + "epoch": 0.96, + "grad_norm": 0.10691336542367935, + "learning_rate": 0.00019863613034027224, + "loss": 1.2258, + "step": 12 + }, + { + "epoch": 0.96, + "eval_loss": 1.296021580696106, + "eval_runtime": 29.6291, + "eval_samples_per_second": 3.375, + "eval_steps_per_second": 1.688, + "step": 12 + } + ], + "logging_steps": 1, + "max_steps": 48, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 12, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2500249176244224.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12/training_args.bin b/checkpoint-12/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8d1abf3674a169f158217fe8beb0d6440c93b7ed --- /dev/null +++ b/checkpoint-12/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9de536148fb50b99c1a1c0fda0b62700ebddf19e423baa229b84d9504c8284b +size 5944 diff --git a/checkpoint-24/README.md b/checkpoint-24/README.md new file mode 100644 index 0000000000000000000000000000000000000000..136e7e248b46366ae875395c8633d12be7b75f80 --- /dev/null +++ b/checkpoint-24/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations 
+ + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-24/adapter_config.json b/checkpoint-24/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c2f35e8ab83cadcf66049c27173cb1262d730192 --- /dev/null +++ b/checkpoint-24/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "k_proj", + "up_proj", + "down_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-24/adapter_model.safetensors b/checkpoint-24/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d99b0265d768c96244874ff98ac77e8aa293608f --- /dev/null +++ b/checkpoint-24/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67420864913bf0f42a9ef48086ce13937732242dbc9aa876b5b44b3475b92eb9 
+size 100966336
diff --git a/checkpoint-24/optimizer.pt b/checkpoint-24/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3dd31e672b8a7081f47f577e30cfbe868b497df8
--- /dev/null
+++ b/checkpoint-24/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87e88e5c37837daf2c65be357c477b98f9858c11ed8f5796b4210b8c555b5410
+size 50916644
diff --git a/checkpoint-24/rng_state.pth b/checkpoint-24/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..265aa2e542c0b7ad0d243a0d3d3d3c49234c3476
--- /dev/null
+++ b/checkpoint-24/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ee5d7712ead1407778ee7b5133124a5e5000c60cc08598d67c309c3e44dcfca
+size 14244
diff --git a/checkpoint-24/scheduler.pt b/checkpoint-24/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..202deb39034de2c553e2527f4d7a38b34e0361a4
--- /dev/null
+++ b/checkpoint-24/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fd2df94f244c74ef9128181bbcabe340233f441f19aa2de60f32d36a56a9cac
+size 1064
diff --git a/checkpoint-24/special_tokens_map.json b/checkpoint-24/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-24/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-24/tokenizer.model b/checkpoint-24/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-24/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-24/tokenizer_config.json b/checkpoint-24/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86
--- /dev/null
+++ b/checkpoint-24/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-24/trainer_state.json b/checkpoint-24/trainer_state.json
new file mode 100644
index
0000000000000000000000000000000000000000..bdb49ba54e7f63c537853b4be882ae96b4b4d13f --- /dev/null +++ b/checkpoint-24/trainer_state.json @@ -0,0 +1,273 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.88, + "eval_steps": 3, + "global_step": 24, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 0.1656520962715149, + "learning_rate": 2e-05, + "loss": 1.4615, + "step": 1 + }, + { + "epoch": 0.08, + "eval_loss": 1.4899382591247559, + "eval_runtime": 29.186, + "eval_samples_per_second": 3.426, + "eval_steps_per_second": 1.713, + "step": 1 + }, + { + "epoch": 0.16, + "grad_norm": 0.1882542371749878, + "learning_rate": 4e-05, + "loss": 1.4241, + "step": 2 + }, + { + "epoch": 0.24, + "grad_norm": 0.15945176780223846, + "learning_rate": 6e-05, + "loss": 1.3849, + "step": 3 + }, + { + "epoch": 0.24, + "eval_loss": 1.485183835029602, + "eval_runtime": 29.3484, + "eval_samples_per_second": 3.407, + "eval_steps_per_second": 1.704, + "step": 3 + }, + { + "epoch": 0.32, + "grad_norm": 0.13675835728645325, + "learning_rate": 8e-05, + "loss": 1.2212, + "step": 4 + }, + { + "epoch": 0.4, + "grad_norm": 0.1532098948955536, + "learning_rate": 0.0001, + "loss": 1.3626, + "step": 5 + }, + { + "epoch": 0.48, + "grad_norm": 0.16159594058990479, + "learning_rate": 0.00012, + "loss": 1.3665, + "step": 6 + }, + { + "epoch": 0.48, + "eval_loss": 1.441084623336792, + "eval_runtime": 29.4785, + "eval_samples_per_second": 3.392, + "eval_steps_per_second": 1.696, + "step": 6 + }, + { + "epoch": 0.56, + "grad_norm": 0.1462002545595169, + "learning_rate": 0.00014, + "loss": 1.3003, + "step": 7 + }, + { + "epoch": 0.64, + "grad_norm": 0.13418763875961304, + "learning_rate": 0.00016, + "loss": 1.3331, + "step": 8 + }, + { + "epoch": 0.72, + "grad_norm": 0.10984567552804947, + "learning_rate": 0.00018, + "loss": 1.2689, + "step": 9 + }, + { + "epoch": 0.72, + "eval_loss": 1.3380621671676636, + "eval_runtime": 29.5305, + "eval_samples_per_second": 3.386, + "eval_steps_per_second": 1.693, + "step": 9 + }, + { + "epoch": 0.8, + "grad_norm": 0.10075916349887848, + "learning_rate": 0.0002, + "loss": 1.2942, + "step": 10 + }, + { + "epoch": 0.88, + "grad_norm": 0.11774784326553345, + "learning_rate": 0.000199658449300667, + "loss": 1.2936, + "step": 11 + }, + { + "epoch": 0.96, + "grad_norm": 0.10691336542367935, + "learning_rate": 0.00019863613034027224, + "loss": 1.2258, + "step": 12 + }, + { + "epoch": 0.96, + "eval_loss": 1.296021580696106, + "eval_runtime": 29.6291, + "eval_samples_per_second": 3.375, + "eval_steps_per_second": 1.688, + "step": 12 + }, + { + "epoch": 1.04, + "grad_norm": 0.10832437872886658, + "learning_rate": 0.00019694002659393305, + "loss": 1.2647, + "step": 13 + }, + { + "epoch": 1.08, + "grad_norm": 0.11240752041339874, + "learning_rate": 0.00019458172417006347, + "loss": 1.2595, + "step": 14 + }, + { + "epoch": 1.16, + "grad_norm": 0.10769112408161163, + "learning_rate": 0.00019157733266550575, + "loss": 1.2518, + "step": 15 + }, + { + "epoch": 1.16, + "eval_loss": 1.279650092124939, + "eval_runtime": 29.6168, + "eval_samples_per_second": 3.376, + "eval_steps_per_second": 1.688, + "step": 15 + }, + { + "epoch": 1.24, + "grad_norm": 0.09908384829759598, + "learning_rate": 0.0001879473751206489, + "loss": 1.1644, + "step": 16 + }, + { + "epoch": 1.32, + "grad_norm": 0.09107685834169388, + "learning_rate": 0.00018371664782625287, + "loss": 1.1601, + "step": 17 + }, + { + "epoch": 
1.4, + "grad_norm": 0.09361294656991959, + "learning_rate": 0.00017891405093963938, + "loss": 1.2263, + "step": 18 + }, + { + "epoch": 1.4, + "eval_loss": 1.2533847093582153, + "eval_runtime": 29.5676, + "eval_samples_per_second": 3.382, + "eval_steps_per_second": 1.691, + "step": 18 + }, + { + "epoch": 1.48, + "grad_norm": 0.0980026125907898, + "learning_rate": 0.00017357239106731317, + "loss": 1.2272, + "step": 19 + }, + { + "epoch": 1.56, + "grad_norm": 0.07957063615322113, + "learning_rate": 0.00016772815716257412, + "loss": 1.1913, + "step": 20 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.07226990163326263, + "learning_rate": 0.0001614212712689668, + "loss": 1.1343, + "step": 21 + }, + { + "epoch": 1.6400000000000001, + "eval_loss": 1.2354038953781128, + "eval_runtime": 29.5859, + "eval_samples_per_second": 3.38, + "eval_steps_per_second": 1.69, + "step": 21 + }, + { + "epoch": 1.72, + "grad_norm": 0.0797078013420105, + "learning_rate": 0.00015469481581224272, + "loss": 1.202, + "step": 22 + }, + { + "epoch": 1.8, + "grad_norm": 0.0746772438287735, + "learning_rate": 0.00014759473930370736, + "loss": 1.2479, + "step": 23 + }, + { + "epoch": 1.88, + "grad_norm": 0.07190073281526566, + "learning_rate": 0.00014016954246529696, + "loss": 1.2699, + "step": 24 + }, + { + "epoch": 1.88, + "eval_loss": 1.2254745960235596, + "eval_runtime": 29.6412, + "eval_samples_per_second": 3.374, + "eval_steps_per_second": 1.687, + "step": 24 + } + ], + "logging_steps": 1, + "max_steps": 48, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 12, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5000498352488448.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-24/training_args.bin b/checkpoint-24/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8d1abf3674a169f158217fe8beb0d6440c93b7ed --- /dev/null +++ b/checkpoint-24/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9de536148fb50b99c1a1c0fda0b62700ebddf19e423baa229b84d9504c8284b +size 5944 diff --git a/checkpoint-36/README.md b/checkpoint-36/README.md new file mode 100644 index 0000000000000000000000000000000000000000..136e7e248b46366ae875395c8633d12be7b75f80 --- /dev/null +++ b/checkpoint-36/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### 
Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-36/adapter_config.json b/checkpoint-36/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c2f35e8ab83cadcf66049c27173cb1262d730192 --- /dev/null +++ b/checkpoint-36/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "k_proj", + "up_proj", + "down_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-36/adapter_model.safetensors b/checkpoint-36/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..486c933d455f806af6b64b09850a0aba84396a97 --- /dev/null +++ b/checkpoint-36/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:38374a06443c41c2707200afb95d60bed35df189dfabb4b3f4d5069a835b11b5
+size 100966336
diff --git a/checkpoint-36/optimizer.pt b/checkpoint-36/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..73142bc6b1c3b9e748a1f21533111aa24906a9ce
--- /dev/null
+++ b/checkpoint-36/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76b8fa38588655c2dd93cda7a9b5efba7d4891351dcba2ced236ae0fabe383f0
+size 50916644
diff --git a/checkpoint-36/rng_state.pth b/checkpoint-36/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cadd3b0d36c693857ced847205e4324232bdcd7f
--- /dev/null
+++ b/checkpoint-36/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9406ba9d38dc24285cb7340ffc34019dd1a7aa57153373925ee7805ef0137aae
+size 14244
diff --git a/checkpoint-36/scheduler.pt b/checkpoint-36/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c709adb50c980617bac33c11cf5ae0931260631d
--- /dev/null
+++ b/checkpoint-36/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5918ddd95097cd0d9acd73ea2bf14c23b23f8d6e0bb73e5c46156ea038bd743
+size 1064
diff --git a/checkpoint-36/special_tokens_map.json b/checkpoint-36/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-36/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-36/tokenizer.model b/checkpoint-36/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-36/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-36/tokenizer_config.json b/checkpoint-36/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86
--- /dev/null
+++ b/checkpoint-36/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-36/trainer_state.json b/checkpoint-36/trainer_state.json
new file mode 100644
index
0000000000000000000000000000000000000000..278a02a4248763fe028b5bb76c33de98dc07152f --- /dev/null +++ b/checkpoint-36/trainer_state.json @@ -0,0 +1,389 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8, + "eval_steps": 3, + "global_step": 36, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 0.1656520962715149, + "learning_rate": 2e-05, + "loss": 1.4615, + "step": 1 + }, + { + "epoch": 0.08, + "eval_loss": 1.4899382591247559, + "eval_runtime": 29.186, + "eval_samples_per_second": 3.426, + "eval_steps_per_second": 1.713, + "step": 1 + }, + { + "epoch": 0.16, + "grad_norm": 0.1882542371749878, + "learning_rate": 4e-05, + "loss": 1.4241, + "step": 2 + }, + { + "epoch": 0.24, + "grad_norm": 0.15945176780223846, + "learning_rate": 6e-05, + "loss": 1.3849, + "step": 3 + }, + { + "epoch": 0.24, + "eval_loss": 1.485183835029602, + "eval_runtime": 29.3484, + "eval_samples_per_second": 3.407, + "eval_steps_per_second": 1.704, + "step": 3 + }, + { + "epoch": 0.32, + "grad_norm": 0.13675835728645325, + "learning_rate": 8e-05, + "loss": 1.2212, + "step": 4 + }, + { + "epoch": 0.4, + "grad_norm": 0.1532098948955536, + "learning_rate": 0.0001, + "loss": 1.3626, + "step": 5 + }, + { + "epoch": 0.48, + "grad_norm": 0.16159594058990479, + "learning_rate": 0.00012, + "loss": 1.3665, + "step": 6 + }, + { + "epoch": 0.48, + "eval_loss": 1.441084623336792, + "eval_runtime": 29.4785, + "eval_samples_per_second": 3.392, + "eval_steps_per_second": 1.696, + "step": 6 + }, + { + "epoch": 0.56, + "grad_norm": 0.1462002545595169, + "learning_rate": 0.00014, + "loss": 1.3003, + "step": 7 + }, + { + "epoch": 0.64, + "grad_norm": 0.13418763875961304, + "learning_rate": 0.00016, + "loss": 1.3331, + "step": 8 + }, + { + "epoch": 0.72, + "grad_norm": 0.10984567552804947, + "learning_rate": 0.00018, + "loss": 1.2689, + "step": 9 + }, + { + "epoch": 0.72, + "eval_loss": 1.3380621671676636, + "eval_runtime": 29.5305, + "eval_samples_per_second": 3.386, + "eval_steps_per_second": 1.693, + "step": 9 + }, + { + "epoch": 0.8, + "grad_norm": 0.10075916349887848, + "learning_rate": 0.0002, + "loss": 1.2942, + "step": 10 + }, + { + "epoch": 0.88, + "grad_norm": 0.11774784326553345, + "learning_rate": 0.000199658449300667, + "loss": 1.2936, + "step": 11 + }, + { + "epoch": 0.96, + "grad_norm": 0.10691336542367935, + "learning_rate": 0.00019863613034027224, + "loss": 1.2258, + "step": 12 + }, + { + "epoch": 0.96, + "eval_loss": 1.296021580696106, + "eval_runtime": 29.6291, + "eval_samples_per_second": 3.375, + "eval_steps_per_second": 1.688, + "step": 12 + }, + { + "epoch": 1.04, + "grad_norm": 0.10832437872886658, + "learning_rate": 0.00019694002659393305, + "loss": 1.2647, + "step": 13 + }, + { + "epoch": 1.08, + "grad_norm": 0.11240752041339874, + "learning_rate": 0.00019458172417006347, + "loss": 1.2595, + "step": 14 + }, + { + "epoch": 1.16, + "grad_norm": 0.10769112408161163, + "learning_rate": 0.00019157733266550575, + "loss": 1.2518, + "step": 15 + }, + { + "epoch": 1.16, + "eval_loss": 1.279650092124939, + "eval_runtime": 29.6168, + "eval_samples_per_second": 3.376, + "eval_steps_per_second": 1.688, + "step": 15 + }, + { + "epoch": 1.24, + "grad_norm": 0.09908384829759598, + "learning_rate": 0.0001879473751206489, + "loss": 1.1644, + "step": 16 + }, + { + "epoch": 1.32, + "grad_norm": 0.09107685834169388, + "learning_rate": 0.00018371664782625287, + "loss": 1.1601, + "step": 17 + }, + { + "epoch": 
1.4, + "grad_norm": 0.09361294656991959, + "learning_rate": 0.00017891405093963938, + "loss": 1.2263, + "step": 18 + }, + { + "epoch": 1.4, + "eval_loss": 1.2533847093582153, + "eval_runtime": 29.5676, + "eval_samples_per_second": 3.382, + "eval_steps_per_second": 1.691, + "step": 18 + }, + { + "epoch": 1.48, + "grad_norm": 0.0980026125907898, + "learning_rate": 0.00017357239106731317, + "loss": 1.2272, + "step": 19 + }, + { + "epoch": 1.56, + "grad_norm": 0.07957063615322113, + "learning_rate": 0.00016772815716257412, + "loss": 1.1913, + "step": 20 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.07226990163326263, + "learning_rate": 0.0001614212712689668, + "loss": 1.1343, + "step": 21 + }, + { + "epoch": 1.6400000000000001, + "eval_loss": 1.2354038953781128, + "eval_runtime": 29.5859, + "eval_samples_per_second": 3.38, + "eval_steps_per_second": 1.69, + "step": 21 + }, + { + "epoch": 1.72, + "grad_norm": 0.0797078013420105, + "learning_rate": 0.00015469481581224272, + "loss": 1.202, + "step": 22 + }, + { + "epoch": 1.8, + "grad_norm": 0.0746772438287735, + "learning_rate": 0.00014759473930370736, + "loss": 1.2479, + "step": 23 + }, + { + "epoch": 1.88, + "grad_norm": 0.07190073281526566, + "learning_rate": 0.00014016954246529696, + "loss": 1.2699, + "step": 24 + }, + { + "epoch": 1.88, + "eval_loss": 1.2254745960235596, + "eval_runtime": 29.6412, + "eval_samples_per_second": 3.374, + "eval_steps_per_second": 1.687, + "step": 24 + }, + { + "epoch": 1.96, + "grad_norm": 0.06926661729812622, + "learning_rate": 0.00013246994692046836, + "loss": 1.2042, + "step": 25 + }, + { + "epoch": 2.04, + "grad_norm": 0.07788683474063873, + "learning_rate": 0.00012454854871407994, + "loss": 1.1925, + "step": 26 + }, + { + "epoch": 2.08, + "grad_norm": 0.06513918191194534, + "learning_rate": 0.00011645945902807341, + "loss": 1.1493, + "step": 27 + }, + { + "epoch": 2.08, + "eval_loss": 1.2227890491485596, + "eval_runtime": 29.6808, + "eval_samples_per_second": 3.369, + "eval_steps_per_second": 1.685, + "step": 27 + }, + { + "epoch": 2.16, + "grad_norm": 0.07514671981334686, + "learning_rate": 0.00010825793454723325, + "loss": 1.1685, + "step": 28 + }, + { + "epoch": 2.24, + "grad_norm": 0.06782150268554688, + "learning_rate": 0.0001, + "loss": 1.2049, + "step": 29 + }, + { + "epoch": 2.32, + "grad_norm": 0.06837104260921478, + "learning_rate": 9.174206545276677e-05, + "loss": 1.153, + "step": 30 + }, + { + "epoch": 2.32, + "eval_loss": 1.2187583446502686, + "eval_runtime": 29.5599, + "eval_samples_per_second": 3.383, + "eval_steps_per_second": 1.691, + "step": 30 + }, + { + "epoch": 2.4, + "grad_norm": 0.0675550326704979, + "learning_rate": 8.35405409719266e-05, + "loss": 1.1826, + "step": 31 + }, + { + "epoch": 2.48, + "grad_norm": 0.06812074780464172, + "learning_rate": 7.54514512859201e-05, + "loss": 1.2106, + "step": 32 + }, + { + "epoch": 2.56, + "grad_norm": 0.06854978948831558, + "learning_rate": 6.753005307953167e-05, + "loss": 1.1947, + "step": 33 + }, + { + "epoch": 2.56, + "eval_loss": 1.2183395624160767, + "eval_runtime": 29.5611, + "eval_samples_per_second": 3.383, + "eval_steps_per_second": 1.691, + "step": 33 + }, + { + "epoch": 2.64, + "grad_norm": 0.06954147666692734, + "learning_rate": 5.983045753470308e-05, + "loss": 1.1887, + "step": 34 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.07219364494085312, + "learning_rate": 5.240526069629265e-05, + "loss": 1.1784, + "step": 35 + }, + { + "epoch": 2.8, + "grad_norm": 0.07003732025623322, + "learning_rate": 
4.530518418775733e-05, + "loss": 1.1125, + "step": 36 + }, + { + "epoch": 2.8, + "eval_loss": 1.2157045602798462, + "eval_runtime": 29.5711, + "eval_samples_per_second": 3.382, + "eval_steps_per_second": 1.691, + "step": 36 + } + ], + "logging_steps": 1, + "max_steps": 48, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 12, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7474703266480128.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-36/training_args.bin b/checkpoint-36/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8d1abf3674a169f158217fe8beb0d6440c93b7ed --- /dev/null +++ b/checkpoint-36/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9de536148fb50b99c1a1c0fda0b62700ebddf19e423baa229b84d9504c8284b +size 5944 diff --git a/checkpoint-48/README.md b/checkpoint-48/README.md new file mode 100644 index 0000000000000000000000000000000000000000..136e7e248b46366ae875395c8633d12be7b75f80 --- /dev/null +++ b/checkpoint-48/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-48/adapter_config.json b/checkpoint-48/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c2f35e8ab83cadcf66049c27173cb1262d730192 --- /dev/null +++ b/checkpoint-48/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "k_proj", + "up_proj", + "down_proj", + "gate_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-48/adapter_model.safetensors b/checkpoint-48/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b39c73ab1ad4d03512473209ab24f6cced1abef2 --- /dev/null +++ b/checkpoint-48/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:923d1515f2fb6f18bfcbb0bd39ebfe59466355ed28f31874ca9b4d1cc9dcf394 +size 100966336 diff --git a/checkpoint-48/optimizer.pt b/checkpoint-48/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..70fe88ff5abbfe3cf892a964357803cf68619ce6 --- /dev/null +++ b/checkpoint-48/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dacdc37918f4b44f3347f0408792c3fd0fe2a19685176f0da66530313424968 +size 50916644 diff --git a/checkpoint-48/rng_state.pth b/checkpoint-48/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d6b718933d5e5f9f8716b99780c528adfdf0928f --- /dev/null +++ b/checkpoint-48/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b4504d41cd874064ffb537ca9dcd675667dec4f833bc05973da50665fb906b +size 14244 diff --git a/checkpoint-48/scheduler.pt b/checkpoint-48/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c9349de67f87d64b3f06a643104e8f5404a2137 --- /dev/null +++ b/checkpoint-48/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b832e0373d616c3d50894a908dda7ef6c28f6cb2f8a92b6d36348dbf67fd1715 +size 1064 diff --git 
a/checkpoint-48/special_tokens_map.json b/checkpoint-48/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-48/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-48/tokenizer.model b/checkpoint-48/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-48/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-48/tokenizer_config.json b/checkpoint-48/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86
--- /dev/null
+++ b/checkpoint-48/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-48/trainer_state.json b/checkpoint-48/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..997e8032bb00a7da00ce599810d09eed96c8b7f4
--- /dev/null
+++ b/checkpoint-48/trainer_state.json
@@ -0,0 +1,505 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.7199999999999998,
+ "eval_steps": 3,
+ "global_step": 48,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.1656520962715149,
+ "learning_rate": 2e-05,
+ "loss": 1.4615,
+ "step": 1
+ },
+ {
+ "epoch": 0.08,
+ "eval_loss": 1.4899382591247559,
+ "eval_runtime": 29.186,
+ "eval_samples_per_second": 3.426,
+ "eval_steps_per_second": 1.713,
+ "step": 1
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.1882542371749878,
+ "learning_rate": 4e-05,
+ "loss": 1.4241,
+ "step": 2
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.15945176780223846,
+ "learning_rate": 6e-05,
+ "loss": 1.3849,
+ "step": 3
+ },
+ {
+ "epoch": 0.24,
+ "eval_loss": 1.485183835029602,
+ "eval_runtime": 29.3484,
+ "eval_samples_per_second": 3.407,
+ "eval_steps_per_second": 1.704,
+ "step": 3
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.13675835728645325,
+ "learning_rate": 8e-05,
+ "loss": 1.2212,
+ "step": 4
+ },
+ {
+ "epoch": 0.4,
+
"grad_norm": 0.1532098948955536, + "learning_rate": 0.0001, + "loss": 1.3626, + "step": 5 + }, + { + "epoch": 0.48, + "grad_norm": 0.16159594058990479, + "learning_rate": 0.00012, + "loss": 1.3665, + "step": 6 + }, + { + "epoch": 0.48, + "eval_loss": 1.441084623336792, + "eval_runtime": 29.4785, + "eval_samples_per_second": 3.392, + "eval_steps_per_second": 1.696, + "step": 6 + }, + { + "epoch": 0.56, + "grad_norm": 0.1462002545595169, + "learning_rate": 0.00014, + "loss": 1.3003, + "step": 7 + }, + { + "epoch": 0.64, + "grad_norm": 0.13418763875961304, + "learning_rate": 0.00016, + "loss": 1.3331, + "step": 8 + }, + { + "epoch": 0.72, + "grad_norm": 0.10984567552804947, + "learning_rate": 0.00018, + "loss": 1.2689, + "step": 9 + }, + { + "epoch": 0.72, + "eval_loss": 1.3380621671676636, + "eval_runtime": 29.5305, + "eval_samples_per_second": 3.386, + "eval_steps_per_second": 1.693, + "step": 9 + }, + { + "epoch": 0.8, + "grad_norm": 0.10075916349887848, + "learning_rate": 0.0002, + "loss": 1.2942, + "step": 10 + }, + { + "epoch": 0.88, + "grad_norm": 0.11774784326553345, + "learning_rate": 0.000199658449300667, + "loss": 1.2936, + "step": 11 + }, + { + "epoch": 0.96, + "grad_norm": 0.10691336542367935, + "learning_rate": 0.00019863613034027224, + "loss": 1.2258, + "step": 12 + }, + { + "epoch": 0.96, + "eval_loss": 1.296021580696106, + "eval_runtime": 29.6291, + "eval_samples_per_second": 3.375, + "eval_steps_per_second": 1.688, + "step": 12 + }, + { + "epoch": 1.04, + "grad_norm": 0.10832437872886658, + "learning_rate": 0.00019694002659393305, + "loss": 1.2647, + "step": 13 + }, + { + "epoch": 1.08, + "grad_norm": 0.11240752041339874, + "learning_rate": 0.00019458172417006347, + "loss": 1.2595, + "step": 14 + }, + { + "epoch": 1.16, + "grad_norm": 0.10769112408161163, + "learning_rate": 0.00019157733266550575, + "loss": 1.2518, + "step": 15 + }, + { + "epoch": 1.16, + "eval_loss": 1.279650092124939, + "eval_runtime": 29.6168, + "eval_samples_per_second": 3.376, + "eval_steps_per_second": 1.688, + "step": 15 + }, + { + "epoch": 1.24, + "grad_norm": 0.09908384829759598, + "learning_rate": 0.0001879473751206489, + "loss": 1.1644, + "step": 16 + }, + { + "epoch": 1.32, + "grad_norm": 0.09107685834169388, + "learning_rate": 0.00018371664782625287, + "loss": 1.1601, + "step": 17 + }, + { + "epoch": 1.4, + "grad_norm": 0.09361294656991959, + "learning_rate": 0.00017891405093963938, + "loss": 1.2263, + "step": 18 + }, + { + "epoch": 1.4, + "eval_loss": 1.2533847093582153, + "eval_runtime": 29.5676, + "eval_samples_per_second": 3.382, + "eval_steps_per_second": 1.691, + "step": 18 + }, + { + "epoch": 1.48, + "grad_norm": 0.0980026125907898, + "learning_rate": 0.00017357239106731317, + "loss": 1.2272, + "step": 19 + }, + { + "epoch": 1.56, + "grad_norm": 0.07957063615322113, + "learning_rate": 0.00016772815716257412, + "loss": 1.1913, + "step": 20 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.07226990163326263, + "learning_rate": 0.0001614212712689668, + "loss": 1.1343, + "step": 21 + }, + { + "epoch": 1.6400000000000001, + "eval_loss": 1.2354038953781128, + "eval_runtime": 29.5859, + "eval_samples_per_second": 3.38, + "eval_steps_per_second": 1.69, + "step": 21 + }, + { + "epoch": 1.72, + "grad_norm": 0.0797078013420105, + "learning_rate": 0.00015469481581224272, + "loss": 1.202, + "step": 22 + }, + { + "epoch": 1.8, + "grad_norm": 0.0746772438287735, + "learning_rate": 0.00014759473930370736, + "loss": 1.2479, + "step": 23 + }, + { + "epoch": 1.88, + "grad_norm": 0.07190073281526566, 
+ "learning_rate": 0.00014016954246529696, + "loss": 1.2699, + "step": 24 + }, + { + "epoch": 1.88, + "eval_loss": 1.2254745960235596, + "eval_runtime": 29.6412, + "eval_samples_per_second": 3.374, + "eval_steps_per_second": 1.687, + "step": 24 + }, + { + "epoch": 1.96, + "grad_norm": 0.06926661729812622, + "learning_rate": 0.00013246994692046836, + "loss": 1.2042, + "step": 25 + }, + { + "epoch": 2.04, + "grad_norm": 0.07788683474063873, + "learning_rate": 0.00012454854871407994, + "loss": 1.1925, + "step": 26 + }, + { + "epoch": 2.08, + "grad_norm": 0.06513918191194534, + "learning_rate": 0.00011645945902807341, + "loss": 1.1493, + "step": 27 + }, + { + "epoch": 2.08, + "eval_loss": 1.2227890491485596, + "eval_runtime": 29.6808, + "eval_samples_per_second": 3.369, + "eval_steps_per_second": 1.685, + "step": 27 + }, + { + "epoch": 2.16, + "grad_norm": 0.07514671981334686, + "learning_rate": 0.00010825793454723325, + "loss": 1.1685, + "step": 28 + }, + { + "epoch": 2.24, + "grad_norm": 0.06782150268554688, + "learning_rate": 0.0001, + "loss": 1.2049, + "step": 29 + }, + { + "epoch": 2.32, + "grad_norm": 0.06837104260921478, + "learning_rate": 9.174206545276677e-05, + "loss": 1.153, + "step": 30 + }, + { + "epoch": 2.32, + "eval_loss": 1.2187583446502686, + "eval_runtime": 29.5599, + "eval_samples_per_second": 3.383, + "eval_steps_per_second": 1.691, + "step": 30 + }, + { + "epoch": 2.4, + "grad_norm": 0.0675550326704979, + "learning_rate": 8.35405409719266e-05, + "loss": 1.1826, + "step": 31 + }, + { + "epoch": 2.48, + "grad_norm": 0.06812074780464172, + "learning_rate": 7.54514512859201e-05, + "loss": 1.2106, + "step": 32 + }, + { + "epoch": 2.56, + "grad_norm": 0.06854978948831558, + "learning_rate": 6.753005307953167e-05, + "loss": 1.1947, + "step": 33 + }, + { + "epoch": 2.56, + "eval_loss": 1.2183395624160767, + "eval_runtime": 29.5611, + "eval_samples_per_second": 3.383, + "eval_steps_per_second": 1.691, + "step": 33 + }, + { + "epoch": 2.64, + "grad_norm": 0.06954147666692734, + "learning_rate": 5.983045753470308e-05, + "loss": 1.1887, + "step": 34 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.07219364494085312, + "learning_rate": 5.240526069629265e-05, + "loss": 1.1784, + "step": 35 + }, + { + "epoch": 2.8, + "grad_norm": 0.07003732025623322, + "learning_rate": 4.530518418775733e-05, + "loss": 1.1125, + "step": 36 + }, + { + "epoch": 2.8, + "eval_loss": 1.2157045602798462, + "eval_runtime": 29.5711, + "eval_samples_per_second": 3.382, + "eval_steps_per_second": 1.691, + "step": 36 + }, + { + "epoch": 2.88, + "grad_norm": 0.06867608428001404, + "learning_rate": 3.857872873103322e-05, + "loss": 1.1847, + "step": 37 + }, + { + "epoch": 2.96, + "grad_norm": 0.06787115335464478, + "learning_rate": 3.227184283742591e-05, + "loss": 1.1651, + "step": 38 + }, + { + "epoch": 3.04, + "grad_norm": 0.06784378737211227, + "learning_rate": 2.6427608932686843e-05, + "loss": 1.1512, + "step": 39 + }, + { + "epoch": 3.04, + "eval_loss": 1.2122877836227417, + "eval_runtime": 29.5849, + "eval_samples_per_second": 3.38, + "eval_steps_per_second": 1.69, + "step": 39 + }, + { + "epoch": 3.08, + "grad_norm": 0.064244844019413, + "learning_rate": 2.1085949060360654e-05, + "loss": 1.154, + "step": 40 + }, + { + "epoch": 3.16, + "grad_norm": 0.0654948428273201, + "learning_rate": 1.6283352173747145e-05, + "loss": 1.1454, + "step": 41 + }, + { + "epoch": 3.24, + "grad_norm": 0.06797634065151215, + "learning_rate": 1.2052624879351104e-05, + "loss": 1.1883, + "step": 42 + }, + { + "epoch": 3.24, + 
"eval_loss": 1.209986686706543, + "eval_runtime": 29.54, + "eval_samples_per_second": 3.385, + "eval_steps_per_second": 1.693, + "step": 42 + }, + { + "epoch": 3.32, + "grad_norm": 0.06889563798904419, + "learning_rate": 8.422667334494249e-06, + "loss": 1.174, + "step": 43 + }, + { + "epoch": 3.4, + "grad_norm": 0.06667731702327728, + "learning_rate": 5.418275829936537e-06, + "loss": 1.0968, + "step": 44 + }, + { + "epoch": 3.48, + "grad_norm": 0.0668441653251648, + "learning_rate": 3.059973406066963e-06, + "loss": 1.1012, + "step": 45 + }, + { + "epoch": 3.48, + "eval_loss": 1.2118505239486694, + "eval_runtime": 29.5358, + "eval_samples_per_second": 3.386, + "eval_steps_per_second": 1.693, + "step": 45 + }, + { + "epoch": 3.56, + "grad_norm": 0.06772288680076599, + "learning_rate": 1.3638696597277679e-06, + "loss": 1.1264, + "step": 46 + }, + { + "epoch": 3.64, + "grad_norm": 0.06901554763317108, + "learning_rate": 3.415506993330153e-07, + "loss": 1.1457, + "step": 47 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 0.06979726254940033, + "learning_rate": 0.0, + "loss": 1.1891, + "step": 48 + }, + { + "epoch": 3.7199999999999998, + "eval_loss": 1.2122316360473633, + "eval_runtime": 29.5331, + "eval_samples_per_second": 3.386, + "eval_steps_per_second": 1.693, + "step": 48 + } + ], + "logging_steps": 1, + "max_steps": 48, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 12, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9974952442724352.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-48/training_args.bin b/checkpoint-48/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8d1abf3674a169f158217fe8beb0d6440c93b7ed --- /dev/null +++ b/checkpoint-48/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9de536148fb50b99c1a1c0fda0b62700ebddf19e423baa229b84d9504c8284b +size 5944 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a50c0f91ba9ec2bf031adf7aaf6c9ab855b30884 --- /dev/null +++ b/config.json @@ -0,0 +1,44 @@ +{ + "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": false, + "_load_in_8bit": true, + "bnb_4bit_compute_dtype": "float32", + "bnb_4bit_quant_storage": "uint8", + "bnb_4bit_quant_type": "fp4", + "bnb_4bit_use_double_quant": false, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": false, + "load_in_8bit": true, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.41.1", + "use_cache": false, + "vocab_size": 32000 +} diff --git 
a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "</s>", + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "</s>", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false, + "use_fast": true +}
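
The `trainer_state.json` contents above are plain JSON, so the loss curve can be recovered without any training framework: training logs carry `loss`, evaluation logs carry `eval_loss`, and both carry `step`. A minimal sketch of pulling them out of `log_history`; the `checkpoint-48/trainer_state.json` path is an assumption based on where the HF `Trainer` normally writes the file inside a checkpoint directory.

```python
import json
from pathlib import Path

# Assumed path: Trainer saves trainer_state.json inside each checkpoint dir.
state = json.loads(Path("checkpoint-48/trainer_state.json").read_text())

# Training-step entries have a "loss" key; evaluation entries have "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"final train loss: {train[-1][1]:.4f} (step {train[-1][0]})")
print(f"final eval loss:  {evals[-1][1]:.4f} (step {evals[-1][0]})")
```

Run against the state shown here, this should report an eval loss of roughly 1.2122 at step 48, matching the headline number in the README.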
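
Since `config.json` bakes in a bitsandbytes 8-bit `quantization_config` and the adapter weights sit at the repo root, inference needs only the base model plus the LoRA adapter and the tokenizer files above. A minimal sketch, assuming a local clone of this repo in the working directory; `adapter_dir` and the alpaca-style prompt template are assumptions for illustration, not part of the repo.

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

base_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
adapter_dir = "."  # assumed: directory holding adapter_config.json / adapter_model.bin

# Load the base model in 8-bit, mirroring the quantization_config in config.json.
model = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_dir)

# tokenizer.model / tokenizer_config.json live next to the adapter files.
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

# Alpaca-style prompt, since training used mhenrichsen/alpaca_2k_test in alpaca format.
prompt = "### Instruction:\nName three primary colors.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```

Alternatively, merging the adapter into the base weights (`model.merge_and_unload()` on a non-quantized load) would give a standalone model, at the cost of the memory savings from 8-bit loading.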