PeterJinGo committed on
Commit
e0e1fc0
1 Parent(s): 5c1d958

Model save

Browse files
Files changed (4) hide show
  1. README.md +3 -5
  2. all_results.json +5 -5
  3. train_results.json +5 -5
  4. trainer_state.json +611 -114
README.md CHANGED
@@ -4,8 +4,6 @@ tags:
4
  - trl
5
  - sft
6
  - generated_from_trainer
7
- datasets:
8
- - generator
9
  model-index:
10
  - name: zephyr-7b-sft-full-sftwdpodata
11
  results: []
@@ -16,9 +14,9 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # zephyr-7b-sft-full-sftwdpodata
18
 
19
- This model was trained from scratch on the generator dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 1.1374
22
 
23
  ## Model description
24
 
@@ -54,7 +52,7 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 1.1588 | 1.0 | 123 | 1.1374 |
58
 
59
 
60
  ### Framework versions
 
4
  - trl
5
  - sft
6
  - generated_from_trainer
 
 
7
  model-index:
8
  - name: zephyr-7b-sft-full-sftwdpodata
9
  results: []
 
14
 
15
  # zephyr-7b-sft-full-sftwdpodata
16
 
17
+ This model was trained from scratch on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 1.0375
20
 
21
  ## Model description
22
 
 
52
 
53
  | Training Loss | Epoch | Step | Validation Loss |
54
  |:-------------:|:-----:|:----:|:---------------:|
55
+ | 1.0685 | 1.0 | 478 | 1.0375 |
56
 
57
 
58
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "total_flos": 51507395297280.0,
4
- "train_loss": 1.1927482141711847,
5
- "train_runtime": 664.3286,
6
  "train_samples": 61134,
7
- "train_samples_per_second": 23.68,
8
- "train_steps_per_second": 0.185
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 97516887736320.0,
4
+ "train_loss": 1.1027952946379593,
5
+ "train_runtime": 1452.993,
6
  "train_samples": 61134,
7
+ "train_samples_per_second": 42.075,
8
+ "train_steps_per_second": 0.329
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "total_flos": 51507395297280.0,
4
- "train_loss": 1.1927482141711847,
5
- "train_runtime": 664.3286,
6
  "train_samples": 61134,
7
- "train_samples_per_second": 23.68,
8
- "train_steps_per_second": 0.185
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 97516887736320.0,
4
+ "train_loss": 1.1027952946379593,
5
+ "train_runtime": 1452.993,
6
  "train_samples": 61134,
7
+ "train_samples_per_second": 42.075,
8
+ "train_steps_per_second": 0.329
9
  }
trainer_state.json CHANGED
@@ -3,206 +3,703 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 123,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.008130081300813009,
13
- "grad_norm": 5.968609245665954,
14
- "learning_rate": 3.846153846153846e-08,
15
- "loss": 1.3047,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.04065040650406504,
20
- "grad_norm": 6.622098028585483,
21
- "learning_rate": 1.9230769230769231e-07,
22
- "loss": 1.3293,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.08130081300813008,
27
- "grad_norm": 5.601752396412818,
28
- "learning_rate": 3.8461538461538463e-07,
29
- "loss": 1.3265,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.12195121951219512,
34
- "grad_norm": 3.3465461495166955,
35
- "learning_rate": 4.995922759815338e-07,
36
- "loss": 1.3014,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.16260162601626016,
41
- "grad_norm": 2.285061060375805,
42
- "learning_rate": 4.950206402730983e-07,
43
- "loss": 1.267,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.2032520325203252,
48
- "grad_norm": 2.906521061514324,
49
- "learning_rate": 4.854610909098811e-07,
50
- "loss": 1.2635,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.24390243902439024,
55
- "grad_norm": 2.2161224244788196,
56
- "learning_rate": 4.7110823274945357e-07,
57
- "loss": 1.2199,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.2845528455284553,
62
- "grad_norm": 1.9040029033032413,
63
- "learning_rate": 4.5225424859373684e-07,
64
- "loss": 1.2142,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.3252032520325203,
69
- "grad_norm": 1.8144925748816008,
70
- "learning_rate": 4.292829511897409e-07,
71
- "loss": 1.2051,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.36585365853658536,
76
- "grad_norm": 1.6349825973341405,
77
- "learning_rate": 4.0266196990885955e-07,
78
- "loss": 1.1723,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.4065040650406504,
83
- "grad_norm": 1.664912835299134,
84
- "learning_rate": 3.72933231161401e-07,
85
- "loss": 1.177,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.44715447154471544,
90
- "grad_norm": 1.6158486513675248,
91
- "learning_rate": 3.407019263376602e-07,
92
- "loss": 1.1748,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.4878048780487805,
97
- "grad_norm": 1.661677150645168,
98
- "learning_rate": 3.0662419185644114e-07,
99
- "loss": 1.1791,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.5284552845528455,
104
- "grad_norm": 1.5725357820334132,
105
- "learning_rate": 2.7139375211970995e-07,
106
- "loss": 1.1598,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.5691056910569106,
111
- "grad_norm": 1.621045635617722,
112
- "learning_rate": 2.3572779728430797e-07,
113
- "loss": 1.166,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.6097560975609756,
118
- "grad_norm": 1.6041377989080807,
119
- "learning_rate": 2.0035238333856368e-07,
120
- "loss": 1.1698,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.6504065040650406,
125
- "grad_norm": 1.6387599262025716,
126
- "learning_rate": 1.6598765169614244e-07,
127
- "loss": 1.1467,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.6910569105691057,
132
- "grad_norm": 1.5728900163922992,
133
- "learning_rate": 1.3333316919358157e-07,
134
- "loss": 1.1588,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.7317073170731707,
139
- "grad_norm": 1.630523905646432,
140
- "learning_rate": 1.0305368692688174e-07,
141
- "loss": 1.1511,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.7723577235772358,
146
- "grad_norm": 1.6142247634044153,
147
- "learning_rate": 7.576560783617667e-08,
148
- "loss": 1.1514,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.8130081300813008,
153
- "grad_norm": 1.6635344099346825,
154
- "learning_rate": 5.202443851943125e-08,
155
- "loss": 1.1339,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.8536585365853658,
160
- "grad_norm": 1.6142940019540397,
161
- "learning_rate": 3.231348072005574e-08,
162
- "loss": 1.1436,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.8943089430894309,
167
- "grad_norm": 1.7064997556493062,
168
- "learning_rate": 1.7033992697136928e-08,
169
- "loss": 1.1522,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.9349593495934959,
174
- "grad_norm": 1.5806639080165765,
175
- "learning_rate": 6.497020764416633e-09,
176
- "loss": 1.1297,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.975609756097561,
181
- "grad_norm": 1.649978945004722,
182
- "learning_rate": 9.170672843271666e-10,
183
- "loss": 1.1588,
184
  "step": 120
185
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  {
187
  "epoch": 1.0,
188
- "eval_loss": 1.137434482574463,
189
- "eval_runtime": 4.7008,
190
- "eval_samples_per_second": 108.067,
191
- "eval_steps_per_second": 1.702,
192
- "step": 123
193
  },
194
  {
195
  "epoch": 1.0,
196
- "step": 123,
197
- "total_flos": 51507395297280.0,
198
- "train_loss": 1.1927482141711847,
199
- "train_runtime": 664.3286,
200
- "train_samples_per_second": 23.68,
201
- "train_steps_per_second": 0.185
202
  }
203
  ],
204
  "logging_steps": 5,
205
- "max_steps": 123,
206
  "num_input_tokens_seen": 0,
207
  "num_train_epochs": 1,
208
  "save_steps": 100,
@@ -218,7 +715,7 @@
218
  "attributes": {}
219
  }
220
  },
221
- "total_flos": 51507395297280.0,
222
  "train_batch_size": 16,
223
  "trial_name": null,
224
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 478,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0020920502092050207,
13
+ "grad_norm": 8.136081343052572,
14
+ "learning_rate": 1.0416666666666666e-08,
15
+ "loss": 1.2533,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.010460251046025104,
20
+ "grad_norm": 10.869224095264249,
21
+ "learning_rate": 5.208333333333333e-08,
22
+ "loss": 1.3074,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.02092050209205021,
27
+ "grad_norm": 9.394947250234965,
28
+ "learning_rate": 1.0416666666666667e-07,
29
+ "loss": 1.3511,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.03138075313807531,
34
+ "grad_norm": 8.48453952211364,
35
+ "learning_rate": 1.5624999999999999e-07,
36
+ "loss": 1.3286,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.04184100418410042,
41
+ "grad_norm": 9.807884523848902,
42
+ "learning_rate": 2.0833333333333333e-07,
43
+ "loss": 1.2539,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.05230125523012552,
48
+ "grad_norm": 6.770187197984029,
49
+ "learning_rate": 2.604166666666667e-07,
50
+ "loss": 1.2428,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.06276150627615062,
55
+ "grad_norm": 4.388418711512584,
56
+ "learning_rate": 3.1249999999999997e-07,
57
+ "loss": 1.2573,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.07322175732217573,
62
+ "grad_norm": 3.7767106766677507,
63
+ "learning_rate": 3.645833333333333e-07,
64
+ "loss": 1.254,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.08368200836820083,
69
+ "grad_norm": 3.8857851665569503,
70
+ "learning_rate": 4.1666666666666667e-07,
71
+ "loss": 1.193,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.09414225941422594,
76
+ "grad_norm": 3.6981444602921743,
77
+ "learning_rate": 4.6874999999999996e-07,
78
+ "loss": 1.2297,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.10460251046025104,
83
+ "grad_norm": 3.164525008947102,
84
+ "learning_rate": 4.999733114418725e-07,
85
+ "loss": 1.2045,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.11506276150627615,
90
+ "grad_norm": 3.3438200347539975,
91
+ "learning_rate": 4.996731305997416e-07,
92
+ "loss": 1.2055,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.12552301255230125,
97
+ "grad_norm": 3.4475160228942117,
98
+ "learning_rate": 4.990398100856366e-07,
99
+ "loss": 1.1582,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.13598326359832635,
104
+ "grad_norm": 4.990341002466793,
105
+ "learning_rate": 4.980741949411839e-07,
106
+ "loss": 1.169,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.14644351464435146,
111
+ "grad_norm": 3.0806862112851725,
112
+ "learning_rate": 4.967775735898179e-07,
113
+ "loss": 1.1512,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.15690376569037656,
118
+ "grad_norm": 3.2012998938874717,
119
+ "learning_rate": 4.951516761176343e-07,
120
+ "loss": 1.1196,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.16736401673640167,
125
+ "grad_norm": 3.254309758852426,
126
+ "learning_rate": 4.931986719649298e-07,
127
+ "loss": 1.1321,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.17782426778242677,
132
+ "grad_norm": 3.521038876761785,
133
+ "learning_rate": 4.909211670315114e-07,
134
+ "loss": 1.176,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.18828451882845187,
139
+ "grad_norm": 3.1094901411244407,
140
+ "learning_rate": 4.883222001996351e-07,
141
+ "loss": 1.0846,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.19874476987447698,
146
+ "grad_norm": 3.944726072257309,
147
+ "learning_rate": 4.854052392792161e-07,
148
+ "loss": 1.1643,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.20920502092050208,
153
+ "grad_norm": 3.0199061104695275,
154
+ "learning_rate": 4.821741763807186e-07,
155
+ "loss": 1.1587,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.2196652719665272,
160
+ "grad_norm": 3.0664109423340156,
161
+ "learning_rate": 4.786333227218995e-07,
162
+ "loss": 1.1017,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.2301255230125523,
167
+ "grad_norm": 3.5363910368317404,
168
+ "learning_rate": 4.747874028753375e-07,
169
+ "loss": 1.118,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.2405857740585774,
174
+ "grad_norm": 3.2790793251193913,
175
+ "learning_rate": 4.706415484644195e-07,
176
+ "loss": 1.1024,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.2510460251046025,
181
+ "grad_norm": 3.1085664448119616,
182
+ "learning_rate": 4.662012913161997e-07,
183
+ "loss": 1.1253,
184
  "step": 120
185
  },
186
+ {
187
+ "epoch": 0.2615062761506276,
188
+ "grad_norm": 3.17942285679677,
189
+ "learning_rate": 4.614725560802639e-07,
190
+ "loss": 1.1144,
191
+ "step": 125
192
+ },
193
+ {
194
+ "epoch": 0.2719665271966527,
195
+ "grad_norm": 2.9359440437528233,
196
+ "learning_rate": 4.5646165232345103e-07,
197
+ "loss": 1.0742,
198
+ "step": 130
199
+ },
200
+ {
201
+ "epoch": 0.2824267782426778,
202
+ "grad_norm": 3.378239775065935,
203
+ "learning_rate": 4.511752661109768e-07,
204
+ "loss": 1.1246,
205
+ "step": 135
206
+ },
207
+ {
208
+ "epoch": 0.2928870292887029,
209
+ "grad_norm": 3.0222815305213646,
210
+ "learning_rate": 4.456204510851956e-07,
211
+ "loss": 1.0949,
212
+ "step": 140
213
+ },
214
+ {
215
+ "epoch": 0.303347280334728,
216
+ "grad_norm": 3.305312909236648,
217
+ "learning_rate": 4.398046190539024e-07,
218
+ "loss": 1.1114,
219
+ "step": 145
220
+ },
221
+ {
222
+ "epoch": 0.3138075313807531,
223
+ "grad_norm": 3.31590711665521,
224
+ "learning_rate": 4.337355301007335e-07,
225
+ "loss": 1.1266,
226
+ "step": 150
227
+ },
228
+ {
229
+ "epoch": 0.32426778242677823,
230
+ "grad_norm": 3.7988390548789135,
231
+ "learning_rate": 4.2742128223086115e-07,
232
+ "loss": 1.0817,
233
+ "step": 155
234
+ },
235
+ {
236
+ "epoch": 0.33472803347280333,
237
+ "grad_norm": 3.5053528166093195,
238
+ "learning_rate": 4.2087030056579986e-07,
239
+ "loss": 1.1247,
240
+ "step": 160
241
+ },
242
+ {
243
+ "epoch": 0.34518828451882844,
244
+ "grad_norm": 3.002251062725114,
245
+ "learning_rate": 4.140913261017382e-07,
246
+ "loss": 1.0626,
247
+ "step": 165
248
+ },
249
+ {
250
+ "epoch": 0.35564853556485354,
251
+ "grad_norm": 3.5448336222698997,
252
+ "learning_rate": 4.070934040463998e-07,
253
+ "loss": 1.0787,
254
+ "step": 170
255
+ },
256
+ {
257
+ "epoch": 0.36610878661087864,
258
+ "grad_norm": 2.917615696290126,
259
+ "learning_rate": 3.9988587174999306e-07,
260
+ "loss": 1.0624,
261
+ "step": 175
262
+ },
263
+ {
264
+ "epoch": 0.37656903765690375,
265
+ "grad_norm": 3.333598035642462,
266
+ "learning_rate": 3.9247834624635404e-07,
267
+ "loss": 1.0995,
268
+ "step": 180
269
+ },
270
+ {
271
+ "epoch": 0.38702928870292885,
272
+ "grad_norm": 3.128496425971326,
273
+ "learning_rate": 3.848807114209074e-07,
274
+ "loss": 1.0672,
275
+ "step": 185
276
+ },
277
+ {
278
+ "epoch": 0.39748953974895396,
279
+ "grad_norm": 3.178190677786049,
280
+ "learning_rate": 3.7710310482256523e-07,
281
+ "loss": 1.086,
282
+ "step": 190
283
+ },
284
+ {
285
+ "epoch": 0.40794979079497906,
286
+ "grad_norm": 3.5211557677293204,
287
+ "learning_rate": 3.691559041371631e-07,
288
+ "loss": 1.0735,
289
+ "step": 195
290
+ },
291
+ {
292
+ "epoch": 0.41841004184100417,
293
+ "grad_norm": 5.250340368942305,
294
+ "learning_rate": 3.610497133404795e-07,
295
+ "loss": 1.084,
296
+ "step": 200
297
+ },
298
+ {
299
+ "epoch": 0.42887029288702927,
300
+ "grad_norm": 3.0435886010974325,
301
+ "learning_rate": 3.5279534854931674e-07,
302
+ "loss": 1.0937,
303
+ "step": 205
304
+ },
305
+ {
306
+ "epoch": 0.4393305439330544,
307
+ "grad_norm": 3.155631541031488,
308
+ "learning_rate": 3.4440382358952115e-07,
309
+ "loss": 1.0526,
310
+ "step": 210
311
+ },
312
+ {
313
+ "epoch": 0.4497907949790795,
314
+ "grad_norm": 3.1856253958176373,
315
+ "learning_rate": 3.3588633530019866e-07,
316
+ "loss": 1.0744,
317
+ "step": 215
318
+ },
319
+ {
320
+ "epoch": 0.4602510460251046,
321
+ "grad_norm": 3.2478748519097516,
322
+ "learning_rate": 3.272542485937368e-07,
323
+ "loss": 1.0996,
324
+ "step": 220
325
+ },
326
+ {
327
+ "epoch": 0.4707112970711297,
328
+ "grad_norm": 3.5150799679920652,
329
+ "learning_rate": 3.185190812915646e-07,
330
+ "loss": 1.0786,
331
+ "step": 225
332
+ },
333
+ {
334
+ "epoch": 0.4811715481171548,
335
+ "grad_norm": 3.2027896128694904,
336
+ "learning_rate": 3.096924887558854e-07,
337
+ "loss": 1.0722,
338
+ "step": 230
339
+ },
340
+ {
341
+ "epoch": 0.4916317991631799,
342
+ "grad_norm": 3.0097866657045333,
343
+ "learning_rate": 3.007862483378906e-07,
344
+ "loss": 1.0753,
345
+ "step": 235
346
+ },
347
+ {
348
+ "epoch": 0.502092050209205,
349
+ "grad_norm": 2.8959084947264464,
350
+ "learning_rate": 2.9181224366319943e-07,
351
+ "loss": 1.0898,
352
+ "step": 240
353
+ },
354
+ {
355
+ "epoch": 0.5125523012552301,
356
+ "grad_norm": 3.1849250279587555,
357
+ "learning_rate": 2.827824487755007e-07,
358
+ "loss": 1.1032,
359
+ "step": 245
360
+ },
361
+ {
362
+ "epoch": 0.5230125523012552,
363
+ "grad_norm": 3.151630569719264,
364
+ "learning_rate": 2.7370891215954565e-07,
365
+ "loss": 1.0704,
366
+ "step": 250
367
+ },
368
+ {
369
+ "epoch": 0.5334728033472803,
370
+ "grad_norm": 3.2021502155528534,
371
+ "learning_rate": 2.646037406648165e-07,
372
+ "loss": 1.0897,
373
+ "step": 255
374
+ },
375
+ {
376
+ "epoch": 0.5439330543933054,
377
+ "grad_norm": 3.281246167153216,
378
+ "learning_rate": 2.55479083351317e-07,
379
+ "loss": 1.0572,
380
+ "step": 260
381
+ },
382
+ {
383
+ "epoch": 0.5543933054393305,
384
+ "grad_norm": 3.286796060289107,
385
+ "learning_rate": 2.463471152790427e-07,
386
+ "loss": 1.0995,
387
+ "step": 265
388
+ },
389
+ {
390
+ "epoch": 0.5648535564853556,
391
+ "grad_norm": 3.161375590564493,
392
+ "learning_rate": 2.3722002126275822e-07,
393
+ "loss": 1.1033,
394
+ "step": 270
395
+ },
396
+ {
397
+ "epoch": 0.5753138075313807,
398
+ "grad_norm": 3.2395710239773834,
399
+ "learning_rate": 2.2810997961375938e-07,
400
+ "loss": 1.0621,
401
+ "step": 275
402
+ },
403
+ {
404
+ "epoch": 0.5857740585774058,
405
+ "grad_norm": 3.246465238459929,
406
+ "learning_rate": 2.19029145890313e-07,
407
+ "loss": 1.0814,
408
+ "step": 280
409
+ },
410
+ {
411
+ "epoch": 0.5962343096234309,
412
+ "grad_norm": 3.2179889434846087,
413
+ "learning_rate": 2.0998963667845536e-07,
414
+ "loss": 1.0905,
415
+ "step": 285
416
+ },
417
+ {
418
+ "epoch": 0.606694560669456,
419
+ "grad_norm": 3.1419612482251154,
420
+ "learning_rate": 2.0100351342479216e-07,
421
+ "loss": 1.0668,
422
+ "step": 290
423
+ },
424
+ {
425
+ "epoch": 0.6171548117154811,
426
+ "grad_norm": 3.5676672142270487,
427
+ "learning_rate": 1.920827663428714e-07,
428
+ "loss": 1.056,
429
+ "step": 295
430
+ },
431
+ {
432
+ "epoch": 0.6276150627615062,
433
+ "grad_norm": 3.1612890211744267,
434
+ "learning_rate": 1.8323929841460178e-07,
435
+ "loss": 1.095,
436
+ "step": 300
437
+ },
438
+ {
439
+ "epoch": 0.6380753138075314,
440
+ "grad_norm": 3.2825957498988956,
441
+ "learning_rate": 1.7448490950806548e-07,
442
+ "loss": 1.0861,
443
+ "step": 305
444
+ },
445
+ {
446
+ "epoch": 0.6485355648535565,
447
+ "grad_norm": 3.726600439972812,
448
+ "learning_rate": 1.6583128063291573e-07,
449
+ "loss": 1.0501,
450
+ "step": 310
451
+ },
452
+ {
453
+ "epoch": 0.6589958158995816,
454
+ "grad_norm": 3.1847430333577544,
455
+ "learning_rate": 1.572899583543671e-07,
456
+ "loss": 1.0644,
457
+ "step": 315
458
+ },
459
+ {
460
+ "epoch": 0.6694560669456067,
461
+ "grad_norm": 3.229050818238049,
462
+ "learning_rate": 1.488723393865766e-07,
463
+ "loss": 1.0764,
464
+ "step": 320
465
+ },
466
+ {
467
+ "epoch": 0.6799163179916318,
468
+ "grad_norm": 3.0713199002348155,
469
+ "learning_rate": 1.4058965538597032e-07,
470
+ "loss": 1.0416,
471
+ "step": 325
472
+ },
473
+ {
474
+ "epoch": 0.6903765690376569,
475
+ "grad_norm": 3.357568728209587,
476
+ "learning_rate": 1.3245295796480788e-07,
477
+ "loss": 1.0694,
478
+ "step": 330
479
+ },
480
+ {
481
+ "epoch": 0.700836820083682,
482
+ "grad_norm": 3.2375362653987705,
483
+ "learning_rate": 1.2447310394498017e-07,
484
+ "loss": 1.0341,
485
+ "step": 335
486
+ },
487
+ {
488
+ "epoch": 0.7112970711297071,
489
+ "grad_norm": 3.2586888391876148,
490
+ "learning_rate": 1.1666074087171627e-07,
491
+ "loss": 1.0602,
492
+ "step": 340
493
+ },
494
+ {
495
+ "epoch": 0.7217573221757322,
496
+ "grad_norm": 3.3346267359627126,
497
+ "learning_rate": 1.090262928065293e-07,
498
+ "loss": 1.0648,
499
+ "step": 345
500
+ },
501
+ {
502
+ "epoch": 0.7322175732217573,
503
+ "grad_norm": 2.8836656878180857,
504
+ "learning_rate": 1.0157994641835734e-07,
505
+ "loss": 1.0398,
506
+ "step": 350
507
+ },
508
+ {
509
+ "epoch": 0.7426778242677824,
510
+ "grad_norm": 3.206673738900126,
511
+ "learning_rate": 9.433163739145771e-08,
512
+ "loss": 1.0688,
513
+ "step": 355
514
+ },
515
+ {
516
+ "epoch": 0.7531380753138075,
517
+ "grad_norm": 3.2095084267239495,
518
+ "learning_rate": 8.729103716819111e-08,
519
+ "loss": 1.0279,
520
+ "step": 360
521
+ },
522
+ {
523
+ "epoch": 0.7635983263598326,
524
+ "grad_norm": 3.20440166136788,
525
+ "learning_rate": 8.046754004438428e-08,
526
+ "loss": 1.059,
527
+ "step": 365
528
+ },
529
+ {
530
+ "epoch": 0.7740585774058577,
531
+ "grad_norm": 3.742393968055389,
532
+ "learning_rate": 7.387025063449081e-08,
533
+ "loss": 1.0595,
534
+ "step": 370
535
+ },
536
+ {
537
+ "epoch": 0.7845188284518828,
538
+ "grad_norm": 3.2395714997445193,
539
+ "learning_rate": 6.75079717232744e-08,
540
+ "loss": 1.0486,
541
+ "step": 375
542
+ },
543
+ {
544
+ "epoch": 0.7949790794979079,
545
+ "grad_norm": 3.2720407044526403,
546
+ "learning_rate": 6.138919252022435e-08,
547
+ "loss": 1.0704,
548
+ "step": 380
549
+ },
550
+ {
551
+ "epoch": 0.805439330543933,
552
+ "grad_norm": 3.370063395554973,
553
+ "learning_rate": 5.552207733237543e-08,
554
+ "loss": 1.0606,
555
+ "step": 385
556
+ },
557
+ {
558
+ "epoch": 0.8158995815899581,
559
+ "grad_norm": 3.0968930333404625,
560
+ "learning_rate": 4.991445467064689e-08,
561
+ "loss": 1.0566,
562
+ "step": 390
563
+ },
564
+ {
565
+ "epoch": 0.8263598326359832,
566
+ "grad_norm": 3.3774763830658534,
567
+ "learning_rate": 4.4573806804234335e-08,
568
+ "loss": 1.0499,
569
+ "step": 395
570
+ },
571
+ {
572
+ "epoch": 0.8368200836820083,
573
+ "grad_norm": 3.44447850480557,
574
+ "learning_rate": 3.9507259776993954e-08,
575
+ "loss": 1.0559,
576
+ "step": 400
577
+ },
578
+ {
579
+ "epoch": 0.8472803347280334,
580
+ "grad_norm": 3.1498166707617594,
581
+ "learning_rate": 3.472157389913874e-08,
582
+ "loss": 1.0547,
583
+ "step": 405
584
+ },
585
+ {
586
+ "epoch": 0.8577405857740585,
587
+ "grad_norm": 3.3701661155234865,
588
+ "learning_rate": 3.022313472693447e-08,
589
+ "loss": 1.0985,
590
+ "step": 410
591
+ },
592
+ {
593
+ "epoch": 0.8682008368200836,
594
+ "grad_norm": 3.3374460611814154,
595
+ "learning_rate": 2.601794454243139e-08,
596
+ "loss": 1.0592,
597
+ "step": 415
598
+ },
599
+ {
600
+ "epoch": 0.8786610878661087,
601
+ "grad_norm": 3.390815082624942,
602
+ "learning_rate": 2.2111614344599684e-08,
603
+ "loss": 1.0762,
604
+ "step": 420
605
+ },
606
+ {
607
+ "epoch": 0.8891213389121339,
608
+ "grad_norm": 3.189149656101972,
609
+ "learning_rate": 1.850935636255496e-08,
610
+ "loss": 1.0932,
611
+ "step": 425
612
+ },
613
+ {
614
+ "epoch": 0.899581589958159,
615
+ "grad_norm": 3.9882552930751176,
616
+ "learning_rate": 1.521597710086439e-08,
617
+ "loss": 1.0775,
618
+ "step": 430
619
+ },
620
+ {
621
+ "epoch": 0.9100418410041841,
622
+ "grad_norm": 3.4652282672677934,
623
+ "learning_rate": 1.2235870926211616e-08,
624
+ "loss": 1.0556,
625
+ "step": 435
626
+ },
627
+ {
628
+ "epoch": 0.9205020920502092,
629
+ "grad_norm": 4.369109278161751,
630
+ "learning_rate": 9.57301420397924e-09,
631
+ "loss": 1.0415,
632
+ "step": 440
633
+ },
634
+ {
635
+ "epoch": 0.9309623430962343,
636
+ "grad_norm": 3.2834964218858356,
637
+ "learning_rate": 7.230959992571367e-09,
638
+ "loss": 1.0751,
639
+ "step": 445
640
+ },
641
+ {
642
+ "epoch": 0.9414225941422594,
643
+ "grad_norm": 3.300455182998976,
644
+ "learning_rate": 5.212833302556258e-09,
645
+ "loss": 1.0353,
646
+ "step": 450
647
+ },
648
+ {
649
+ "epoch": 0.9518828451882845,
650
+ "grad_norm": 3.201301345623655,
651
+ "learning_rate": 3.521326926954532e-09,
652
+ "loss": 1.0647,
653
+ "step": 455
654
+ },
655
+ {
656
+ "epoch": 0.9623430962343096,
657
+ "grad_norm": 3.3648524524406622,
658
+ "learning_rate": 2.158697848236607e-09,
659
+ "loss": 1.0874,
660
+ "step": 460
661
+ },
662
+ {
663
+ "epoch": 0.9728033472803347,
664
+ "grad_norm": 3.743838451577809,
665
+ "learning_rate": 1.126764226823812e-09,
666
+ "loss": 1.0479,
667
+ "step": 465
668
+ },
669
+ {
670
+ "epoch": 0.9832635983263598,
671
+ "grad_norm": 3.2562966396262167,
672
+ "learning_rate": 4.269029751107489e-10,
673
+ "loss": 1.0317,
674
+ "step": 470
675
+ },
676
+ {
677
+ "epoch": 0.9937238493723849,
678
+ "grad_norm": 3.1980061195336846,
679
+ "learning_rate": 6.004792024680294e-11,
680
+ "loss": 1.0685,
681
+ "step": 475
682
+ },
683
  {
684
  "epoch": 1.0,
685
+ "eval_loss": 1.0374830961227417,
686
+ "eval_runtime": 10.4617,
687
+ "eval_samples_per_second": 191.174,
688
+ "eval_steps_per_second": 3.059,
689
+ "step": 478
690
  },
691
  {
692
  "epoch": 1.0,
693
+ "step": 478,
694
+ "total_flos": 97516887736320.0,
695
+ "train_loss": 1.1027952946379593,
696
+ "train_runtime": 1452.993,
697
+ "train_samples_per_second": 42.075,
698
+ "train_steps_per_second": 0.329
699
  }
700
  ],
701
  "logging_steps": 5,
702
+ "max_steps": 478,
703
  "num_input_tokens_seen": 0,
704
  "num_train_epochs": 1,
705
  "save_steps": 100,
 
715
  "attributes": {}
716
  }
717
  },
718
+ "total_flos": 97516887736320.0,
719
  "train_batch_size": 16,
720
  "trial_name": null,
721
  "trial_params": null