Mark-Arcee committed
Commit b140801
1 Parent(s): f139b1f

Model save

README.md CHANGED
@@ -2,16 +2,12 @@
 license: apache-2.0
 base_model: mistralai/Mistral-7B-Instruct-v0.2
 tags:
-- alignment-handbook
-- trl
-- sft
-- generated_from_trainer
 - trl
 - sft
 - alignment-handbook
 - generated_from_trainer
 datasets:
-- arcee-ai/Zilo-Filtered-SQL-Instruct
+- generator
 model-index:
 - name: zilo-instruct-v2-sft-filtered
   results: []
@@ -22,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zilo-instruct-v2-sft-filtered
 
-This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the arcee-ai/Zilo-Filtered-SQL-Instruct dataset.
+This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.4072
+- Loss: 0.5474
 
 ## Model description
 
@@ -60,9 +56,9 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 0.6043        | 1.0   | 15   | 0.5420          |
-| 0.3922        | 2.0   | 30   | 0.4157          |
-| 0.2791        | 3.0   | 45   | 0.4072          |
+| 1.8602        | 1.0   | 7    | 1.3280          |
+| 2.9102        | 2.0   | 14   | 0.6256          |
+| 0.5329        | 3.0   | 21   | 0.5474          |
 
 
 ### Framework versions
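For reference, a minimal sketch of loading the resulting checkpoint for inference with transformers. The repo id below is an assumption inferred from the model name in the card, and the SQL prompt is only an illustration of the fine-tuning domain mentioned in the previous card revision.

```python
# Minimal sketch: load the fine-tuned checkpoint and run one chat-formatted prompt.
# The repo id is assumed from the model name in the card and may differ.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "arcee-ai/zilo-instruct-v2-sft-filtered"  # hypothetical repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map="auto")

messages = [{"role": "user", "content": "Write a SQL query that counts the rows in a table named users."}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```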
all_results.json CHANGED
@@ -5,10 +5,10 @@
     "eval_samples": 2958,
     "eval_samples_per_second": 49.046,
     "eval_steps_per_second": 0.986,
-    "total_flos": 18844169011200.0,
-    "train_loss": 0.5649417373869154,
-    "train_runtime": 363.236,
-    "train_samples": 26621,
-    "train_samples_per_second": 14.85,
-    "train_steps_per_second": 0.124
+    "total_flos": 8793945538560.0,
+    "train_loss": 1.4474536804925828,
+    "train_runtime": 170.8306,
+    "train_samples": 12338,
+    "train_samples_per_second": 13.821,
+    "train_steps_per_second": 0.123
 }
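These aggregated metrics are plain JSON written at the end of the run; a small sketch of reading them back, assuming the file sits in the working directory:

```python
# Sketch: read the aggregated run metrics from all_results.json.
import json

with open("all_results.json") as f:
    results = json.load(f)

print(f"train_loss: {results['train_loss']:.4f}")
print(f"train_runtime: {results['train_runtime']:.1f}s "
      f"({results['train_samples_per_second']:.2f} samples/s)")
```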
config.json CHANGED
@@ -21,6 +21,6 @@
     "tie_word_embeddings": false,
     "torch_dtype": "bfloat16",
     "transformers_version": "4.41.1",
-    "use_cache": true,
+    "use_cache": false,
     "vocab_size": 32000
 }
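The only change here is use_cache flipping to false, which training scripts typically do when gradient checkpointing is enabled (the KV cache is incompatible with it). For generation the cache can simply be turned back on after loading; a sketch, assuming a local copy of the checkpoint:

```python
# Sketch: re-enable the KV cache for inference when the saved config has use_cache: false.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("zilo-instruct-v2-sft-filtered")  # local path (assumed)
model.config.use_cache = True  # speeds up autoregressive decoding; irrelevant during training
```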
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce24b378ae43fe02b92e39dd4d0181fc4821e81b76e1ab3b48feb485cee8ad9a
+oid sha256:8bdc26479b269dae211f4fb563e33dc9a9e321a8c077d99b7dd886866e5a0497
 size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:767652637ec5730e9771e3f2929b36cf446fdb16382700a907dde817a6c9b649
+oid sha256:b6b5bd3f865efc671963fb864ab3b68c1a96cef2e962caa42aa0f5468d01239b
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:90277adea4cd1317fb6624f070f075f16342cb8b18b00043f18cb5db49b56010
+oid sha256:27f029b043e2a1a93f6f20165da76cb1cfa9516b9209497f4251e497cfe43580
 size 4540516344
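Only the Git LFS pointers change for the three shards (new sha256 oids for the re-trained weights); the sizes are identical. A sketch of checking a downloaded shard against the oid recorded above, assuming the file is in the working directory:

```python
# Sketch: verify a downloaded shard against the sha256 oid in its Git LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid taken from the updated pointer for shard 1 above
expected = "8bdc26479b269dae211f4fb563e33dc9a9e321a8c077d99b7dd886866e5a0497"
assert sha256_of("model-00001-of-00003.safetensors") == expected
```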
runs/May25_01-35-59_5c31577a2818/events.out.tfevents.1716600985.5c31577a2818.16735.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55a671103226617f87cac9231aeffb3ef62914cd8bfdd7c80312ed1fe8973e97
+size 7156
runs/May25_01-40-07_5c31577a2818/events.out.tfevents.1716601219.5c31577a2818.17665.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a6d92a72ffe3e45c7603637c9ac3a32d6af5afe2d2a3ca6330f71e60d3d47cc
+size 7156
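The two new files under runs/ are TensorBoard event logs for this run. They can be browsed with `tensorboard --logdir runs/`, or read programmatically; a sketch with TensorBoard's event reader, using the run directory name from the diff above (scalar tag names depend on the Trainer version):

```python
# Sketch: list the scalar tags logged in one of the added event files.
# Requires tensorboard (pip install tensorboard).
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

ea = EventAccumulator("runs/May25_01-40-07_5c31577a2818")
ea.Reload()
print(ea.Tags()["scalars"])  # e.g. train/loss, eval/loss
for event in ea.Scalars(ea.Tags()["scalars"][0]):
    print(event.step, event.value)
```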
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 3.0,
-    "total_flos": 18844169011200.0,
-    "train_loss": 0.5649417373869154,
-    "train_runtime": 363.236,
-    "train_samples": 26621,
-    "train_samples_per_second": 14.85,
-    "train_steps_per_second": 0.124
+    "total_flos": 8793945538560.0,
+    "train_loss": 1.4474536804925828,
+    "train_runtime": 170.8306,
+    "train_samples": 12338,
+    "train_samples_per_second": 13.821,
+    "train_steps_per_second": 0.123
 }
trainer_state.json CHANGED
@@ -3,117 +3,82 @@
     "best_model_checkpoint": null,
     "epoch": 3.0,
     "eval_steps": 500,
-    "global_step": 45,
+    "global_step": 21,
     "is_hyper_param_search": false,
     "is_local_process_zero": true,
     "is_world_process_zero": true,
     "log_history": [
         {
-            "epoch": 0.06666666666666667,
-            "grad_norm": 98.21472515172992,
-            "learning_rate": 4.000000000000001e-06,
-            "loss": 1.8595,
+            "epoch": 0.14285714285714285,
+            "grad_norm": 99.75653578103409,
+            "learning_rate": 6.666666666666667e-06,
+            "loss": 1.7828,
             "step": 1
         },
         {
-            "epoch": 0.3333333333333333,
-            "grad_norm": 30.799283162433554,
-            "learning_rate": 2e-05,
-            "loss": 1.3856,
+            "epoch": 0.7142857142857143,
+            "grad_norm": 421.35114809358015,
+            "learning_rate": 1.9396926207859085e-05,
+            "loss": 1.8602,
             "step": 5
         },
-        {
-            "epoch": 0.6666666666666666,
-            "grad_norm": 7.8774399080468624,
-            "learning_rate": 1.9238795325112867e-05,
-            "loss": 0.8251,
-            "step": 10
-        },
-        {
-            "epoch": 1.0,
-            "grad_norm": 4.085781086210361,
-            "learning_rate": 1.7071067811865477e-05,
-            "loss": 0.6043,
-            "step": 15
-        },
         {
             "epoch": 1.0,
-            "eval_loss": 0.5420283079147339,
-            "eval_runtime": 4.2361,
-            "eval_samples_per_second": 46.977,
-            "eval_steps_per_second": 0.944,
-            "step": 15
-        },
-        {
-            "epoch": 1.3333333333333333,
-            "grad_norm": 2.0713099775720534,
-            "learning_rate": 1.3826834323650899e-05,
-            "loss": 0.4794,
-            "step": 20
-        },
-        {
-            "epoch": 1.6666666666666665,
-            "grad_norm": 1.3136102966664787,
-            "learning_rate": 1e-05,
-            "loss": 0.4248,
-            "step": 25
+            "eval_loss": 1.3279948234558105,
+            "eval_runtime": 2.2021,
+            "eval_samples_per_second": 39.054,
+            "eval_steps_per_second": 0.908,
+            "step": 7
         },
         {
-            "epoch": 2.0,
-            "grad_norm": 1.1912342263078068,
-            "learning_rate": 6.173165676349103e-06,
-            "loss": 0.3922,
-            "step": 30
+            "epoch": 1.4285714285714286,
+            "grad_norm": 60.4301410351058,
+            "learning_rate": 1.342020143325669e-05,
+            "loss": 2.9102,
+            "step": 10
         },
         {
             "epoch": 2.0,
-            "eval_loss": 0.4156629145145416,
-            "eval_runtime": 4.0917,
-            "eval_samples_per_second": 48.635,
-            "eval_steps_per_second": 0.978,
-            "step": 30
+            "eval_loss": 0.6256123781204224,
+            "eval_runtime": 2.0607,
+            "eval_samples_per_second": 41.733,
+            "eval_steps_per_second": 0.971,
+            "step": 14
         },
         {
-            "epoch": 2.3333333333333335,
-            "grad_norm": 1.1730848362574522,
-            "learning_rate": 2.9289321881345257e-06,
-            "loss": 0.3075,
-            "step": 35
-        },
-        {
-            "epoch": 2.6666666666666665,
-            "grad_norm": 0.9598398180975349,
-            "learning_rate": 7.612046748871327e-07,
-            "loss": 0.2917,
-            "step": 40
+            "epoch": 2.142857142857143,
+            "grad_norm": 9.507888146594869,
+            "learning_rate": 5.000000000000003e-06,
+            "loss": 0.686,
+            "step": 15
         },
         {
-            "epoch": 3.0,
-            "grad_norm": 1.3661284541735195,
-            "learning_rate": 0.0,
-            "loss": 0.2791,
-            "step": 45
+            "epoch": 2.857142857142857,
+            "grad_norm": 1.2620367321960468,
+            "learning_rate": 1.519224698779198e-07,
+            "loss": 0.5329,
+            "step": 20
         },
         {
             "epoch": 3.0,
-            "eval_loss": 0.40715718269348145,
-            "eval_runtime": 4.0987,
-            "eval_samples_per_second": 48.552,
-            "eval_steps_per_second": 0.976,
-            "step": 45
+            "eval_loss": 0.5474353432655334,
+            "eval_runtime": 2.0545,
+            "eval_samples_per_second": 41.86,
+            "eval_steps_per_second": 0.973,
+            "step": 21
         },
         {
             "epoch": 3.0,
-            "step": 45,
-            "total_flos": 18844169011200.0,
-            "train_loss": 0.5649417373869154,
-            "train_runtime": 363.236,
-            "train_samples_per_second": 14.85,
-            "train_steps_per_second": 0.124
+            "step": 21,
+            "total_flos": 8793945538560.0,
+            "train_loss": 1.4474536804925828,
+            "train_runtime": 170.8306,
+            "train_samples_per_second": 13.821,
+            "train_steps_per_second": 0.123
         }
     ],
     "logging_steps": 5,
-    "max_steps": 45,
+    "max_steps": 21,
     "num_input_tokens_seen": 0,
     "num_train_epochs": 3,
     "save_steps": 100,
@@ -129,7 +94,7 @@
             "attributes": {}
         }
     },
-    "total_flos": 18844169011200.0,
+    "total_flos": 8793945538560.0,
     "train_batch_size": 16,
     "trial_name": null,
     "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:16468e20b0c8c5a6af8b75ded9d347ce0856a32d5e19a9f872bac737dba7ae17
+oid sha256:c16c304b54e234c167d4a62517da9fcb81b63bf76374bb1ffd8fd038494174c9
 size 6392
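Finally, a minimal sketch of inspecting the trainer_state.json and training_args.bin artifacts above, assuming local copies and that torch and transformers are installed. training_args.bin is a pickled TrainingArguments object, so only load it if you trust the source.

```python
# Sketch: read back the per-epoch eval losses and peek at the saved training arguments.
import json

import torch

with open("trainer_state.json") as f:
    state = json.load(f)
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"epoch {entry['epoch']:.0f} (step {entry['step']}): eval_loss={entry['eval_loss']:.4f}")

# weights_only=False is needed because the file is a pickled Python object, not a tensor file.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)
```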