Fizzarolli committed
Commit 9405a0a
Parent: eda687c

End of training

Files changed (1): README.md added (+176 -0)
---
library_name: transformers
license: apache-2.0
base_model: ibm-granite/granite-3.0-1b-a400m-base
tags:
- axolotl
- generated_from_trainer
model-index:
- name: sexy-moe-girl_400MA_1BT-ckpts
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.4.1`
```yaml
# Weights and Biases logging config
wandb_project: sexy-moe-girl_400MA_1BT-2
# wandb_entity:
# wandb_watch: all
wandb_name: v1
# wandb_log_model:

# Model architecture config
base_model: ibm-granite/granite-3.0-1b-a400m-base
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Hugging Face saving config
hub_model_id: allura-org/sexy-moe-girl_400MA_1BT-ckpts
hub_strategy: every_save
push_dataset_to_hub:
hf_use_auth_token: true

# Model checkpointing config
output_dir: out
resume_from_checkpoint:
save_steps:
saves_per_epoch: 5
save_safetensors: true
save_total_limit: 5

# Mixed precision training config
bf16: true
fp16: false
tf32: false

# Model loading config
load_in_8bit: false
load_in_4bit: false
strict: false

# Sequence config
sequence_len: 8192
s2_attention: false
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: false
train_on_inputs: true
group_by_length: false

# Unfrozen parameters for FFT
unfrozen_parameters:

# Dataset config
chat_template: chatml
datasets:
  - path: Fizzarolli/special-sauce
    type: sharegpt
    chat_template: chatml
#val_set_size: 0.05
# evaluation_strategy: steps
# eval_steps:
#evals_per_epoch: 5
# test_datasets:
dataset_prepared_path: last_run_prepared
shuffle_merged_datasets: true

# Training hyperparameters
num_epochs: 2
gradient_accumulation_steps: 4
micro_batch_size: 8
warmup_steps: 150
optimizer: schedule_free_adamw
lr_scheduler: constant_with_warmup
learning_rate: 0.00002
weight_decay: 0.1
max_grad_norm: 1.0
logging_steps: 1

# Model optimization / unsloth (requires unsloth to be installed)
gradient_checkpointing: unsloth
# unsloth_cross_entropy_loss: true
# unsloth_lora_mlp: true
# unsloth_lora_qkv: true
# unsloth_lora_o: true
#plugins:
#  - axolotl.integrations.liger.LigerPlugin
#liger_rope: true
#liger_rms_norm: true
#liger_swiglu: true
#liger_fused_linear_cross_entropy: true

xformers_attention: false
flash_attention: true
sdp_attention: false

# Loss monitoring config
early_stopping_patience: false
loss_watchdog_threshold: 100.0
loss_watchdog_patience: 3

# Debug config
debug: true
seed: 1001

special_tokens:
  eos_token: "<|im_end|>"
  bos_token: "<|endoftext|>"
tokens: # these are delimiters
  - "<|im_start|>"

# Don't mess with this, it's here for accelerate and torchrun
local_rank:

```

</details><br>

# sexy-moe-girl_400MA_1BT-ckpts

This model is a fine-tuned version of [ibm-granite/granite-3.0-1b-a400m-base](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-base) on the Fizzarolli/special-sauce dataset (see the axolotl config above).
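
A minimal inference sketch with transformers, assuming the checkpoint is loaded from the `hub_model_id` in the config (`allura-org/sexy-moe-girl_400MA_1BT-ckpts`); the generation settings are illustrative, not tuned:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "allura-org/sexy-moe-girl_400MA_1BT-ckpts"  # hub_model_id from the config
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype=torch.bfloat16)

# The tokenizer carries the ChatML template configured at training time.
messages = [{"role": "user", "content": "Write a haiku about training."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
output = model.generate(input_ids, max_new_tokens=128)  # illustrative settings
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```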
## Model description

More information needed

## Intended uses & limitations

More information needed
## Training and evaluation data

Per the axolotl config above, training used the Fizzarolli/special-sauce dataset in ShareGPT format, rendered with the ChatML chat template; no validation split was configured. A sketch of the resulting turn layout follows.
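
The ChatML layout wraps each turn in the delimiters declared under `special_tokens` / `tokens` in the config. A minimal sketch of that layout (the `to_chatml` helper is hypothetical, for illustration only):

```python
def to_chatml(messages):
    """Render [{'role': ..., 'content': ...}, ...] as a ChatML string."""
    # <|im_start|> and <|im_end|> are the delimiters added in the config;
    # <|im_end|> doubles as the eos_token.
    return "".join(
        f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages
    )

print(to_chatml([
    {"role": "user", "content": "Hi there!"},
    {"role": "assistant", "content": "Hello!"},
]))
```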
## Training procedure

### Training hyperparameters

The following hyperparameters were used during training (the effective batch size is worked out after the list):
- learning_rate: 2e-05
- train_batch_size: 8
- eval_batch_size: 8
- seed: 1001
- gradient_accumulation_steps: 4
- total_train_batch_size: 32
- optimizer: schedule-free AdamW (`schedule_free_adamw` in the config) with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: constant_with_warmup
- lr_scheduler_warmup_steps: 150
- num_epochs: 2
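
For reference, the effective batch size follows from the values above; a quick check, assuming a single-GPU run (world size 1 is an assumption, not stated in the card):

```python
micro_batch_size = 8             # train_batch_size above
gradient_accumulation_steps = 4
world_size = 1                   # assumption: single GPU
total = micro_batch_size * gradient_accumulation_steps * world_size
assert total == 32               # matches total_train_batch_size
```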

### Training results



### Framework versions

- Transformers 4.45.2
- Pytorch 2.4.1+cu124
- Datasets 3.0.1
- Tokenizers 0.20.1