Fizzarolli committed
Commit fcac909
1 Parent(s): 9405a0a
Update README.md

README.md CHANGED

````diff
@@ -4,173 +4,33 @@ license: apache-2.0
 base_model: ibm-granite/granite-3.0-1b-a400m-base
 tags:
 - axolotl
--
+- moe
+- roleplay
 model-index:
-- name:
+- name: MoE_Girl_400MA_1BT
   results: []
 ---
 
-
-
-
-```yaml
-# Weights and Biases logging coinfig
-wandb_project: sexy-moe-girl_400MA_1BT-2
-# wandb_entity:
-# wandb_watch: all
-wandb_name: v1
-# wandb_log_model:
-
-# Model architecture config
-base_model: ibm-granite/granite-3.0-1b-a400m-base
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-
-# Hugging Face saving config
-hub_model_id: allura-org/sexy-moe-girl_400MA_1BT-ckpts
-hub_strategy: every_save
-push_dataset_to_hub:
-hf_use_auth_token: true
-
-# Model checkpointing config
-output_dir: out
-resume_from_checkpoint:
-save_steps:
-saves_per_epoch: 5
-save_safetensors: true
-save_total_limit: 5
-
-# Mixed precision training config
-bf16: true
-fp16: false
-tf32: false
-
-# Model loading config
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-# Sequence config
-sequence_len: 8192
-s2_attention: false
-sample_packing: true # true # false
-eval_sample_packing: false # true
-pad_to_sequence_len: false #true # false
-train_on_inputs: true
-group_by_length: false
-
-# Unfrozen parameters for FFT
-unfrozen_parameters:
-
-# Dataset config
-chat_template: chatml
-datasets:
-  - path: Fizzarolli/special-sauce
-    type: sharegpt
-    chat_template: chatml
-#val_set_size: 0.05
-# evaluation_strategy: steps
-# eval_steps:
-#evals_per_epoch: 5
-# test_datasets:
-dataset_prepared_path: last_run_prepared
-shuffle_merged_datasets: true
-
-# Training hyperparameters
-num_epochs: 2
-gradient_accumulation_steps: 4
-micro_batch_size: 8
-warmup_steps: 150
-optimizer: schedule_free_adamw
-lr_scheduler: constant_with_warmup
-learning_rate: 0.00002
-weight_decay: 0.1
-max_grad_norm: 1.0
-logging_steps: 1
-
-# # Model optimization / unsloth ---- INSTALL UNSLOTH
-gradient_checkpointing: unsloth
-#
-# unsloth_cross_entropy_loss: true
-# unsloth_lora_mlp: true
-# unsloth_lora_qkv: true
-# unsloth_lora_o: true
-#plugins:
-#  - axolotl.integrations.liger.LigerPlugin
-#liger_rope: true
-#liger_rms_norm: true
-#liger_swiglu: true
-#liger_fused_linear_cross_entropy: true
-
-xformers_attention: false
-flash_attention: true
-sdp_attention: false
-
-# Loss monitoring config
-early_stopping_patience: false
-loss_watchdog_threshold: 100.0
-loss_watchdog_patience: 3
-
-# Debug config
-debug: true
-seed: 1001 # 42
-
-special_tokens:
-  eos_token: "<|im_end|>"
-  bos_token: "<|endoftext|>"
-tokens: # these are delimiters
-  - "<|im_start|>"
-
-# Don't mess with this, it's here for accelerate and torchrun
-local_rank:
-```
-
-
-# sexy-moe-girl_400MA_1BT-ckpts
-
-This model is a fine-tuned version of [ibm-granite/granite-3.0-1b-a400m-base](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-base) on the None dataset.
-
-## Model description
-
-More information needed
-
-## Intended uses & limitations
-
-More information needed
-
-## Training and evaluation data
-
-More information needed
-
-## Training procedure
-
-### Training hyperparameters
-
-The following hyperparameters were used during training:
-- learning_rate: 2e-05
-- train_batch_size: 8
-- eval_batch_size: 8
-- seed: 1001
-- gradient_accumulation_steps: 4
-- total_train_batch_size: 32
-- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
-- lr_scheduler_type: constant_with_warmup
-- lr_scheduler_warmup_steps: 150
-- num_epochs: 2
-
-### Training results
-
-### Framework versions
-
-- Transformers 4.45.2
-- Pytorch 2.4.1+cu124
-- Datasets 3.0.1
-- Tokenizers 0.20.1
+# MoE Girl 400mA 1bT
+![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/634262af8d8089ebaefd410e/kTXXSSSqpb21rfyOX7FUa.jpeg)
+A finetune of IBM's Granite 3.0, designed for roleplaying (and maybe general use cases if you try hard enough).
+
+## Disclaimer
+PLEASE do not expect godliness out of this; it's a model with _400 million_ active parameters. Expect something more akin to GPT-2.
+
+## Quants
+TODO!
+
+## Prompting
+Use ChatML.
+```
+<|im_start|>system
+You are a helpful assistant who talks like a pirate.<|im_end|>
+<|im_start|>user
+Hello there!<|im_end|>
+<|im_start|>assistant
+Yarr harr harr, me matey!<|im_end|>
+```
+
+## Thanks
+Special thanks to the members of Allura for testing and emotional support, as well as to the creators of all the datasets that went into the Special Sauce used to train this model. I love you all <3 - Fizz
````
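For reference, the ChatML format shown in the new README's Prompting section can also be produced with `transformers`' chat-template API instead of by hand. A minimal sketch, assuming the released checkpoint ships a ChatML chat template; the repo id and generation settings below are illustrative, not confirmed by this commit:

```python
# Sketch: render a ChatML conversation and generate a reply with transformers.
# The repo id is hypothetical; substitute the actual released checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "allura-org/MoE-Girl_400MA_1BT"  # hypothetical repo id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

messages = [
    {"role": "system", "content": "You are a helpful assistant who talks like a pirate."},
    {"role": "user", "content": "Hello there!"},
]

# apply_chat_template emits the <|im_start|>/<|im_end|> framing shown above and,
# with add_generation_prompt=True, appends the assistant header so the model
# continues as the assistant.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

output = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```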
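On the training side, the removed config reads `Fizzarolli/special-sauce` as ShareGPT-format conversations (`type: sharegpt`) and renders them with the ChatML template (`chat_template: chatml`). As a rough illustration of that mapping, here is a hand-rolled sketch, not Axolotl's actual conversion code; the role names follow the common ShareGPT `from`/`value` schema:

```python
# Illustrative only: how a ShareGPT-style record maps onto ChatML text,
# per the removed config's `type: sharegpt` + `chat_template: chatml`.
ROLE_MAP = {"system": "system", "human": "user", "gpt": "assistant"}

def sharegpt_to_chatml(sample: dict) -> str:
    """Render {'conversations': [{'from': ..., 'value': ...}]} as ChatML."""
    turns = [
        f"<|im_start|>{ROLE_MAP[t['from']]}\n{t['value']}<|im_end|>"
        for t in sample["conversations"]
    ]
    return "\n".join(turns) + "\n"

print(sharegpt_to_chatml({
    "conversations": [
        {"from": "human", "value": "Hello there!"},
        {"from": "gpt", "value": "Yarr harr harr, me matey!"},
    ]
}))
```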