Josephgflowers commited on
Commit
2c67120
1 Parent(s): be47a88

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +5 -5
  2. train_results.json +5 -5
  3. trainer_state.json +743 -225
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "total_flos": 8.2628573134848e+17,
4
- "train_loss": 0.7277259675255335,
5
- "train_runtime": 75113.2183,
6
- "train_samples": 65000,
7
- "train_samples_per_second": 0.865,
8
  "train_steps_per_second": 0.072
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 1.955920022807249e+18,
4
+ "train_loss": 0.8700304180128008,
5
+ "train_runtime": 179145.9612,
6
+ "train_samples": 153863,
7
+ "train_samples_per_second": 0.859,
8
  "train_steps_per_second": 0.072
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "total_flos": 8.2628573134848e+17,
4
- "train_loss": 0.7277259675255335,
5
- "train_runtime": 75113.2183,
6
- "train_samples": 65000,
7
- "train_samples_per_second": 0.865,
8
  "train_steps_per_second": 0.072
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 1.955920022807249e+18,
4
+ "train_loss": 0.8700304180128008,
5
+ "train_runtime": 179145.9612,
6
+ "train_samples": 153863,
7
+ "train_samples_per_second": 0.859,
8
  "train_steps_per_second": 0.072
9
  }
trainer_state.json CHANGED
@@ -3,404 +3,922 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 5417,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.018460402436773122,
13
- "grad_norm": 72293.6015625,
14
- "learning_rate": 4.907697987816135e-05,
15
- "loss": 0.7984,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.036920804873546244,
20
- "grad_norm": 86618.71875,
21
- "learning_rate": 4.815395975632269e-05,
22
- "loss": 0.7874,
23
  "step": 200
24
  },
25
  {
26
- "epoch": 0.05538120731031936,
27
- "grad_norm": 82291.1953125,
28
- "learning_rate": 4.723093963448403e-05,
29
- "loss": 0.7683,
30
  "step": 300
31
  },
32
  {
33
- "epoch": 0.07384160974709249,
34
- "grad_norm": 72406.7265625,
35
- "learning_rate": 4.6307919512645376e-05,
36
- "loss": 0.7721,
37
  "step": 400
38
  },
39
  {
40
- "epoch": 0.0923020121838656,
41
- "grad_norm": 82859.0,
42
- "learning_rate": 4.538489939080672e-05,
43
- "loss": 0.7672,
44
  "step": 500
45
  },
46
  {
47
- "epoch": 0.11076241462063872,
48
- "grad_norm": 72674.453125,
49
- "learning_rate": 4.4461879268968066e-05,
50
- "loss": 0.7741,
51
  "step": 600
52
  },
53
  {
54
- "epoch": 0.12922281705741184,
55
- "grad_norm": 74598.8828125,
56
- "learning_rate": 4.3538859147129405e-05,
57
- "loss": 0.7684,
58
  "step": 700
59
  },
60
  {
61
- "epoch": 0.14768321949418498,
62
- "grad_norm": 75325.09375,
63
- "learning_rate": 4.261583902529076e-05,
64
- "loss": 0.7706,
65
  "step": 800
66
  },
67
  {
68
- "epoch": 0.1661436219309581,
69
- "grad_norm": 80050.4921875,
70
- "learning_rate": 4.1692818903452095e-05,
71
- "loss": 0.771,
72
  "step": 900
73
  },
74
  {
75
- "epoch": 0.1846040243677312,
76
- "grad_norm": 71205.3359375,
77
- "learning_rate": 4.076979878161344e-05,
78
- "loss": 0.7458,
79
  "step": 1000
80
  },
81
  {
82
- "epoch": 0.20306442680450434,
83
- "grad_norm": 71098.4765625,
84
- "learning_rate": 3.9846778659774785e-05,
85
- "loss": 0.7632,
86
  "step": 1100
87
  },
88
  {
89
- "epoch": 0.22152482924127745,
90
- "grad_norm": 79350.5859375,
91
- "learning_rate": 3.892375853793613e-05,
92
- "loss": 0.7715,
93
  "step": 1200
94
  },
95
  {
96
- "epoch": 0.23998523167805058,
97
- "grad_norm": 78053.2578125,
98
- "learning_rate": 3.800073841609747e-05,
99
- "loss": 0.7582,
100
  "step": 1300
101
  },
102
  {
103
- "epoch": 0.2584456341148237,
104
- "grad_norm": 72609.0078125,
105
- "learning_rate": 3.707771829425882e-05,
106
- "loss": 0.7547,
107
  "step": 1400
108
  },
109
  {
110
- "epoch": 0.27690603655159685,
111
- "grad_norm": 75624.9453125,
112
- "learning_rate": 3.615469817242016e-05,
113
- "loss": 0.7432,
114
  "step": 1500
115
  },
116
  {
117
- "epoch": 0.29536643898836995,
118
- "grad_norm": 67118.828125,
119
- "learning_rate": 3.5231678050581504e-05,
120
- "loss": 0.7522,
121
  "step": 1600
122
  },
123
  {
124
- "epoch": 0.31382684142514305,
125
- "grad_norm": 77685.6171875,
126
- "learning_rate": 3.430865792874285e-05,
127
- "loss": 0.7529,
128
  "step": 1700
129
  },
130
  {
131
- "epoch": 0.3322872438619162,
132
- "grad_norm": 80151.4609375,
133
- "learning_rate": 3.338563780690419e-05,
134
- "loss": 0.7558,
135
  "step": 1800
136
  },
137
  {
138
- "epoch": 0.3507476462986893,
139
- "grad_norm": 68374.609375,
140
- "learning_rate": 3.246261768506554e-05,
141
- "loss": 0.7547,
142
  "step": 1900
143
  },
144
  {
145
- "epoch": 0.3692080487354624,
146
- "grad_norm": 69225.65625,
147
- "learning_rate": 3.153959756322688e-05,
148
- "loss": 0.7283,
149
  "step": 2000
150
  },
151
  {
152
- "epoch": 0.38766845117223553,
153
- "grad_norm": 93264.3984375,
154
- "learning_rate": 3.061657744138822e-05,
155
- "loss": 0.7368,
156
  "step": 2100
157
  },
158
  {
159
- "epoch": 0.4061288536090087,
160
- "grad_norm": 81487.3125,
161
- "learning_rate": 2.9693557319549568e-05,
162
- "loss": 0.7531,
163
  "step": 2200
164
  },
165
  {
166
- "epoch": 0.4245892560457818,
167
- "grad_norm": 72823.8125,
168
- "learning_rate": 2.8770537197710913e-05,
169
- "loss": 0.7216,
170
  "step": 2300
171
  },
172
  {
173
- "epoch": 0.4430496584825549,
174
- "grad_norm": 73118.8203125,
175
- "learning_rate": 2.7847517075872255e-05,
176
- "loss": 0.7356,
177
  "step": 2400
178
  },
179
  {
180
- "epoch": 0.46151006091932806,
181
- "grad_norm": 68678.484375,
182
- "learning_rate": 2.69244969540336e-05,
183
- "loss": 0.7308,
184
  "step": 2500
185
  },
186
  {
187
- "epoch": 0.47997046335610116,
188
- "grad_norm": 73918.640625,
189
- "learning_rate": 2.6001476832194942e-05,
190
- "loss": 0.7235,
191
  "step": 2600
192
  },
193
  {
194
- "epoch": 0.49843086579287427,
195
- "grad_norm": 66172.796875,
196
- "learning_rate": 2.5078456710356284e-05,
197
- "loss": 0.7283,
198
  "step": 2700
199
  },
200
  {
201
- "epoch": 0.5168912682296474,
202
- "grad_norm": 66544.1015625,
203
- "learning_rate": 2.415543658851763e-05,
204
- "loss": 0.7228,
205
  "step": 2800
206
  },
207
  {
208
- "epoch": 0.5353516706664205,
209
- "grad_norm": 66939.625,
210
- "learning_rate": 2.3232416466678974e-05,
211
- "loss": 0.7095,
212
  "step": 2900
213
  },
214
  {
215
- "epoch": 0.5538120731031937,
216
- "grad_norm": 67865.15625,
217
- "learning_rate": 2.230939634484032e-05,
218
- "loss": 0.7248,
219
  "step": 3000
220
  },
221
  {
222
- "epoch": 0.5722724755399667,
223
- "grad_norm": 63117.76953125,
224
- "learning_rate": 2.138637622300166e-05,
225
- "loss": 0.7147,
226
  "step": 3100
227
  },
228
  {
229
- "epoch": 0.5907328779767399,
230
- "grad_norm": 59398.1796875,
231
- "learning_rate": 2.0463356101163006e-05,
232
- "loss": 0.7031,
233
  "step": 3200
234
  },
235
  {
236
- "epoch": 0.6091932804135131,
237
- "grad_norm": 63816.08203125,
238
- "learning_rate": 1.954033597932435e-05,
239
- "loss": 0.7144,
240
  "step": 3300
241
  },
242
  {
243
- "epoch": 0.6276536828502861,
244
- "grad_norm": 70370.5625,
245
- "learning_rate": 1.8617315857485696e-05,
246
- "loss": 0.7141,
247
  "step": 3400
248
  },
249
  {
250
- "epoch": 0.6461140852870593,
251
- "grad_norm": 63610.734375,
252
- "learning_rate": 1.7694295735647038e-05,
253
- "loss": 0.694,
254
  "step": 3500
255
  },
256
  {
257
- "epoch": 0.6645744877238324,
258
- "grad_norm": 59664.05078125,
259
- "learning_rate": 1.6771275613808383e-05,
260
- "loss": 0.7086,
261
  "step": 3600
262
  },
263
  {
264
- "epoch": 0.6830348901606055,
265
- "grad_norm": 64207.88671875,
266
- "learning_rate": 1.5848255491969728e-05,
267
- "loss": 0.7138,
268
  "step": 3700
269
  },
270
  {
271
- "epoch": 0.7014952925973786,
272
- "grad_norm": 79748.2734375,
273
- "learning_rate": 1.4925235370131068e-05,
274
- "loss": 0.6985,
275
  "step": 3800
276
  },
277
  {
278
- "epoch": 0.7199556950341518,
279
- "grad_norm": 72216.671875,
280
- "learning_rate": 1.4002215248292413e-05,
281
- "loss": 0.7019,
282
  "step": 3900
283
  },
284
  {
285
- "epoch": 0.7384160974709248,
286
- "grad_norm": 74815.6328125,
287
- "learning_rate": 1.3079195126453758e-05,
288
- "loss": 0.687,
289
  "step": 4000
290
  },
291
  {
292
- "epoch": 0.756876499907698,
293
- "grad_norm": 61482.41796875,
294
- "learning_rate": 1.2156175004615102e-05,
295
- "loss": 0.6983,
296
  "step": 4100
297
  },
298
  {
299
- "epoch": 0.7753369023444711,
300
- "grad_norm": 73800.75,
301
- "learning_rate": 1.1233154882776445e-05,
302
- "loss": 0.7004,
303
  "step": 4200
304
  },
305
  {
306
- "epoch": 0.7937973047812442,
307
- "grad_norm": 75985.421875,
308
- "learning_rate": 1.0310134760937789e-05,
309
- "loss": 0.7097,
310
  "step": 4300
311
  },
312
  {
313
- "epoch": 0.8122577072180174,
314
- "grad_norm": 67175.03125,
315
- "learning_rate": 9.387114639099132e-06,
316
- "loss": 0.6781,
317
  "step": 4400
318
  },
319
  {
320
- "epoch": 0.8307181096547904,
321
- "grad_norm": 68520.875,
322
- "learning_rate": 8.464094517260476e-06,
323
- "loss": 0.6857,
324
  "step": 4500
325
  },
326
  {
327
- "epoch": 0.8491785120915636,
328
- "grad_norm": 65612.703125,
329
- "learning_rate": 7.541074395421821e-06,
330
- "loss": 0.6971,
331
  "step": 4600
332
  },
333
  {
334
- "epoch": 0.8676389145283367,
335
- "grad_norm": 63280.83203125,
336
- "learning_rate": 6.618054273583164e-06,
337
- "loss": 0.6942,
338
  "step": 4700
339
  },
340
  {
341
- "epoch": 0.8860993169651098,
342
- "grad_norm": 68478.2890625,
343
- "learning_rate": 5.6950341517445085e-06,
344
- "loss": 0.7028,
345
  "step": 4800
346
  },
347
  {
348
- "epoch": 0.904559719401883,
349
- "grad_norm": 68026.8125,
350
- "learning_rate": 4.772014029905853e-06,
351
- "loss": 0.6694,
352
  "step": 4900
353
  },
354
  {
355
- "epoch": 0.9230201218386561,
356
- "grad_norm": 64797.25390625,
357
- "learning_rate": 3.848993908067195e-06,
358
- "loss": 0.704,
359
  "step": 5000
360
  },
361
  {
362
- "epoch": 0.9414805242754292,
363
- "grad_norm": 71872.890625,
364
- "learning_rate": 2.9259737862285397e-06,
365
- "loss": 0.6946,
366
  "step": 5100
367
  },
368
  {
369
- "epoch": 0.9599409267122023,
370
- "grad_norm": 74217.3125,
371
- "learning_rate": 2.002953664389884e-06,
372
- "loss": 0.6796,
373
  "step": 5200
374
  },
375
  {
376
- "epoch": 0.9784013291489755,
377
- "grad_norm": 68286.15625,
378
- "learning_rate": 1.0799335425512278e-06,
379
- "loss": 0.6966,
380
  "step": 5300
381
  },
382
  {
383
- "epoch": 0.9968617315857485,
384
- "grad_norm": 137991.28125,
385
- "learning_rate": 1.5691342071257153e-07,
386
- "loss": 0.7014,
387
  "step": 5400
388
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  {
390
  "epoch": 1.0,
391
- "step": 5417,
392
- "total_flos": 8.2628573134848e+17,
393
- "train_loss": 0.7277259675255335,
394
- "train_runtime": 75113.2183,
395
- "train_samples_per_second": 0.865,
396
  "train_steps_per_second": 0.072
397
  }
398
  ],
399
  "logging_steps": 100,
400
- "max_steps": 5417,
401
  "num_input_tokens_seen": 0,
402
  "num_train_epochs": 1,
403
- "save_steps": 5417,
404
  "stateful_callbacks": {
405
  "TrainerControl": {
406
  "args": {
@@ -413,7 +931,7 @@
413
  "attributes": {}
414
  }
415
  },
416
- "total_flos": 8.2628573134848e+17,
417
  "train_batch_size": 12,
418
  "trial_name": null,
419
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 12822,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.007799095304944627,
13
+ "grad_norm": 63896.45703125,
14
+ "learning_rate": 4.961004523475277e-05,
15
+ "loss": 0.929,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.015598190609889253,
20
+ "grad_norm": 79284.765625,
21
+ "learning_rate": 4.9220090469505544e-05,
22
+ "loss": 0.9281,
23
  "step": 200
24
  },
25
  {
26
+ "epoch": 0.02339728591483388,
27
+ "grad_norm": 80169.203125,
28
+ "learning_rate": 4.883013570425831e-05,
29
+ "loss": 0.9385,
30
  "step": 300
31
  },
32
  {
33
+ "epoch": 0.031196381219778507,
34
+ "grad_norm": 65649.6796875,
35
+ "learning_rate": 4.844018093901108e-05,
36
+ "loss": 0.9454,
37
  "step": 400
38
  },
39
  {
40
+ "epoch": 0.038995476524723134,
41
+ "grad_norm": 82255.6953125,
42
+ "learning_rate": 4.805022617376384e-05,
43
+ "loss": 0.9368,
44
  "step": 500
45
  },
46
  {
47
+ "epoch": 0.04679457182966776,
48
+ "grad_norm": 78376.4296875,
49
+ "learning_rate": 4.7660271408516614e-05,
50
+ "loss": 0.9318,
51
  "step": 600
52
  },
53
  {
54
+ "epoch": 0.05459366713461238,
55
+ "grad_norm": 92200.078125,
56
+ "learning_rate": 4.7270316643269384e-05,
57
+ "loss": 0.9272,
58
  "step": 700
59
  },
60
  {
61
+ "epoch": 0.06239276243955701,
62
+ "grad_norm": 88060.359375,
63
+ "learning_rate": 4.688036187802215e-05,
64
+ "loss": 0.9097,
65
  "step": 800
66
  },
67
  {
68
+ "epoch": 0.07019185774450164,
69
+ "grad_norm": 76161.1796875,
70
+ "learning_rate": 4.649040711277492e-05,
71
+ "loss": 0.9475,
72
  "step": 900
73
  },
74
  {
75
+ "epoch": 0.07799095304944627,
76
+ "grad_norm": 90524.1796875,
77
+ "learning_rate": 4.610045234752769e-05,
78
+ "loss": 0.9481,
79
  "step": 1000
80
  },
81
  {
82
+ "epoch": 0.08579004835439089,
83
+ "grad_norm": 100124.8359375,
84
+ "learning_rate": 4.571049758228046e-05,
85
+ "loss": 0.9333,
86
  "step": 1100
87
  },
88
  {
89
+ "epoch": 0.09358914365933552,
90
+ "grad_norm": 65681.7734375,
91
+ "learning_rate": 4.5320542817033225e-05,
92
+ "loss": 0.9247,
93
  "step": 1200
94
  },
95
  {
96
+ "epoch": 0.10138823896428015,
97
+ "grad_norm": 76333.234375,
98
+ "learning_rate": 4.4930588051785995e-05,
99
+ "loss": 0.9098,
100
  "step": 1300
101
  },
102
  {
103
+ "epoch": 0.10918733426922476,
104
+ "grad_norm": 77002.0703125,
105
+ "learning_rate": 4.4540633286538766e-05,
106
+ "loss": 0.9034,
107
  "step": 1400
108
  },
109
  {
110
+ "epoch": 0.1169864295741694,
111
+ "grad_norm": 72669.1875,
112
+ "learning_rate": 4.415067852129153e-05,
113
+ "loss": 0.9149,
114
  "step": 1500
115
  },
116
  {
117
+ "epoch": 0.12478552487911403,
118
+ "grad_norm": 71785.34375,
119
+ "learning_rate": 4.37607237560443e-05,
120
+ "loss": 0.9051,
121
  "step": 1600
122
  },
123
  {
124
+ "epoch": 0.13258462018405864,
125
+ "grad_norm": 72728.5859375,
126
+ "learning_rate": 4.3370768990797065e-05,
127
+ "loss": 0.9268,
128
  "step": 1700
129
  },
130
  {
131
+ "epoch": 0.14038371548900327,
132
+ "grad_norm": 69223.3203125,
133
+ "learning_rate": 4.2980814225549836e-05,
134
+ "loss": 0.9294,
135
  "step": 1800
136
  },
137
  {
138
+ "epoch": 0.1481828107939479,
139
+ "grad_norm": 62745.203125,
140
+ "learning_rate": 4.2590859460302607e-05,
141
+ "loss": 0.9038,
142
  "step": 1900
143
  },
144
  {
145
+ "epoch": 0.15598190609889254,
146
+ "grad_norm": 79897.8671875,
147
+ "learning_rate": 4.220090469505538e-05,
148
+ "loss": 0.8994,
149
  "step": 2000
150
  },
151
  {
152
+ "epoch": 0.16378100140383717,
153
+ "grad_norm": 74131.546875,
154
+ "learning_rate": 4.181094992980815e-05,
155
+ "loss": 0.9144,
156
  "step": 2100
157
  },
158
  {
159
+ "epoch": 0.17158009670878177,
160
+ "grad_norm": 67760.7109375,
161
+ "learning_rate": 4.142099516456091e-05,
162
+ "loss": 0.9285,
163
  "step": 2200
164
  },
165
  {
166
+ "epoch": 0.1793791920137264,
167
+ "grad_norm": 78868.1484375,
168
+ "learning_rate": 4.103104039931368e-05,
169
+ "loss": 0.8973,
170
  "step": 2300
171
  },
172
  {
173
+ "epoch": 0.18717828731867103,
174
+ "grad_norm": 71780.2109375,
175
+ "learning_rate": 4.064108563406645e-05,
176
+ "loss": 0.8961,
177
  "step": 2400
178
  },
179
  {
180
+ "epoch": 0.19497738262361566,
181
+ "grad_norm": 77858.109375,
182
+ "learning_rate": 4.025113086881922e-05,
183
+ "loss": 0.9171,
184
  "step": 2500
185
  },
186
  {
187
+ "epoch": 0.2027764779285603,
188
+ "grad_norm": 70062.796875,
189
+ "learning_rate": 3.986117610357199e-05,
190
+ "loss": 0.8903,
191
  "step": 2600
192
  },
193
  {
194
+ "epoch": 0.21057557323350493,
195
+ "grad_norm": 171091.046875,
196
+ "learning_rate": 3.947122133832475e-05,
197
+ "loss": 0.9061,
198
  "step": 2700
199
  },
200
  {
201
+ "epoch": 0.21837466853844953,
202
+ "grad_norm": 84371.34375,
203
+ "learning_rate": 3.908126657307752e-05,
204
+ "loss": 0.9315,
205
  "step": 2800
206
  },
207
  {
208
+ "epoch": 0.22617376384339416,
209
+ "grad_norm": 62273.2578125,
210
+ "learning_rate": 3.8691311807830294e-05,
211
+ "loss": 0.9115,
212
  "step": 2900
213
  },
214
  {
215
+ "epoch": 0.2339728591483388,
216
+ "grad_norm": 60464.15625,
217
+ "learning_rate": 3.8301357042583065e-05,
218
+ "loss": 0.8888,
219
  "step": 3000
220
  },
221
  {
222
+ "epoch": 0.24177195445328342,
223
+ "grad_norm": 67546.828125,
224
+ "learning_rate": 3.7911402277335836e-05,
225
+ "loss": 0.9031,
226
  "step": 3100
227
  },
228
  {
229
+ "epoch": 0.24957104975822805,
230
+ "grad_norm": 75904.6328125,
231
+ "learning_rate": 3.75214475120886e-05,
232
+ "loss": 0.9146,
233
  "step": 3200
234
  },
235
  {
236
+ "epoch": 0.2573701450631727,
237
+ "grad_norm": 81015.3828125,
238
+ "learning_rate": 3.713149274684137e-05,
239
+ "loss": 0.8935,
240
  "step": 3300
241
  },
242
  {
243
+ "epoch": 0.2651692403681173,
244
+ "grad_norm": 69763.3203125,
245
+ "learning_rate": 3.6741537981594134e-05,
246
+ "loss": 0.9129,
247
  "step": 3400
248
  },
249
  {
250
+ "epoch": 0.27296833567306195,
251
+ "grad_norm": 68604.5,
252
+ "learning_rate": 3.6351583216346905e-05,
253
+ "loss": 0.8834,
254
  "step": 3500
255
  },
256
  {
257
+ "epoch": 0.28076743097800655,
258
+ "grad_norm": 66758.03125,
259
+ "learning_rate": 3.596162845109967e-05,
260
+ "loss": 0.8925,
261
  "step": 3600
262
  },
263
  {
264
+ "epoch": 0.28856652628295115,
265
+ "grad_norm": 72424.8984375,
266
+ "learning_rate": 3.557167368585244e-05,
267
+ "loss": 0.915,
268
  "step": 3700
269
  },
270
  {
271
+ "epoch": 0.2963656215878958,
272
+ "grad_norm": 65904.9453125,
273
+ "learning_rate": 3.518171892060521e-05,
274
+ "loss": 0.8833,
275
  "step": 3800
276
  },
277
  {
278
+ "epoch": 0.3041647168928404,
279
+ "grad_norm": 62548.9453125,
280
+ "learning_rate": 3.479176415535798e-05,
281
+ "loss": 0.8991,
282
  "step": 3900
283
  },
284
  {
285
+ "epoch": 0.3119638121977851,
286
+ "grad_norm": 85173.90625,
287
+ "learning_rate": 3.440180939011075e-05,
288
+ "loss": 0.9111,
289
  "step": 4000
290
  },
291
  {
292
+ "epoch": 0.3197629075027297,
293
+ "grad_norm": 65947.765625,
294
+ "learning_rate": 3.4011854624863516e-05,
295
+ "loss": 0.8774,
296
  "step": 4100
297
  },
298
  {
299
+ "epoch": 0.32756200280767434,
300
+ "grad_norm": 69672.1875,
301
+ "learning_rate": 3.362189985961629e-05,
302
+ "loss": 0.8816,
303
  "step": 4200
304
  },
305
  {
306
+ "epoch": 0.33536109811261894,
307
+ "grad_norm": 68125.9609375,
308
+ "learning_rate": 3.323194509436906e-05,
309
+ "loss": 0.8784,
310
  "step": 4300
311
  },
312
  {
313
+ "epoch": 0.34316019341756354,
314
+ "grad_norm": 76767.6796875,
315
+ "learning_rate": 3.284199032912182e-05,
316
+ "loss": 0.899,
317
  "step": 4400
318
  },
319
  {
320
+ "epoch": 0.3509592887225082,
321
+ "grad_norm": 64417.96484375,
322
+ "learning_rate": 3.245203556387459e-05,
323
+ "loss": 0.8944,
324
  "step": 4500
325
  },
326
  {
327
+ "epoch": 0.3587583840274528,
328
+ "grad_norm": 62896.37109375,
329
+ "learning_rate": 3.206208079862736e-05,
330
+ "loss": 0.8745,
331
  "step": 4600
332
  },
333
  {
334
+ "epoch": 0.36655747933239746,
335
+ "grad_norm": 58443.88671875,
336
+ "learning_rate": 3.167212603338013e-05,
337
+ "loss": 0.853,
338
  "step": 4700
339
  },
340
  {
341
+ "epoch": 0.37435657463734207,
342
+ "grad_norm": 75019.8515625,
343
+ "learning_rate": 3.12821712681329e-05,
344
+ "loss": 0.8728,
345
  "step": 4800
346
  },
347
  {
348
+ "epoch": 0.38215566994228667,
349
+ "grad_norm": 82645.0078125,
350
+ "learning_rate": 3.089221650288567e-05,
351
+ "loss": 0.8617,
352
  "step": 4900
353
  },
354
  {
355
+ "epoch": 0.3899547652472313,
356
+ "grad_norm": 57014.08984375,
357
+ "learning_rate": 3.0502261737638436e-05,
358
+ "loss": 0.8566,
359
  "step": 5000
360
  },
361
  {
362
+ "epoch": 0.39775386055217593,
363
+ "grad_norm": 71623.7578125,
364
+ "learning_rate": 3.01123069723912e-05,
365
+ "loss": 0.879,
366
  "step": 5100
367
  },
368
  {
369
+ "epoch": 0.4055529558571206,
370
+ "grad_norm": 75911.140625,
371
+ "learning_rate": 2.972235220714397e-05,
372
+ "loss": 0.8889,
373
  "step": 5200
374
  },
375
  {
376
+ "epoch": 0.4133520511620652,
377
+ "grad_norm": 69251.0703125,
378
+ "learning_rate": 2.9332397441896742e-05,
379
+ "loss": 0.8685,
380
  "step": 5300
381
  },
382
  {
383
+ "epoch": 0.42115114646700985,
384
+ "grad_norm": 72434.6484375,
385
+ "learning_rate": 2.8942442676649513e-05,
386
+ "loss": 0.8895,
387
  "step": 5400
388
  },
389
+ {
390
+ "epoch": 0.42895024177195445,
391
+ "grad_norm": 69133.78125,
392
+ "learning_rate": 2.855248791140228e-05,
393
+ "loss": 0.8845,
394
+ "step": 5500
395
+ },
396
+ {
397
+ "epoch": 0.43674933707689906,
398
+ "grad_norm": 67216.6875,
399
+ "learning_rate": 2.8162533146155044e-05,
400
+ "loss": 0.8661,
401
+ "step": 5600
402
+ },
403
+ {
404
+ "epoch": 0.4445484323818437,
405
+ "grad_norm": 61408.3515625,
406
+ "learning_rate": 2.7772578380907815e-05,
407
+ "loss": 0.8743,
408
+ "step": 5700
409
+ },
410
+ {
411
+ "epoch": 0.4523475276867883,
412
+ "grad_norm": 66496.1640625,
413
+ "learning_rate": 2.7382623615660586e-05,
414
+ "loss": 0.8805,
415
+ "step": 5800
416
+ },
417
+ {
418
+ "epoch": 0.460146622991733,
419
+ "grad_norm": 62693.109375,
420
+ "learning_rate": 2.6992668850413357e-05,
421
+ "loss": 0.8634,
422
+ "step": 5900
423
+ },
424
+ {
425
+ "epoch": 0.4679457182966776,
426
+ "grad_norm": 77037.4140625,
427
+ "learning_rate": 2.660271408516612e-05,
428
+ "loss": 0.8834,
429
+ "step": 6000
430
+ },
431
+ {
432
+ "epoch": 0.4757448136016222,
433
+ "grad_norm": 67157.15625,
434
+ "learning_rate": 2.6212759319918888e-05,
435
+ "loss": 0.8993,
436
+ "step": 6100
437
+ },
438
+ {
439
+ "epoch": 0.48354390890656684,
440
+ "grad_norm": 77155.8125,
441
+ "learning_rate": 2.582280455467166e-05,
442
+ "loss": 0.854,
443
+ "step": 6200
444
+ },
445
+ {
446
+ "epoch": 0.49134300421151145,
447
+ "grad_norm": 65097.22265625,
448
+ "learning_rate": 2.543284978942443e-05,
449
+ "loss": 0.8716,
450
+ "step": 6300
451
+ },
452
+ {
453
+ "epoch": 0.4991420995164561,
454
+ "grad_norm": 89373.515625,
455
+ "learning_rate": 2.50428950241772e-05,
456
+ "loss": 0.873,
457
+ "step": 6400
458
+ },
459
+ {
460
+ "epoch": 0.5069411948214008,
461
+ "grad_norm": 65231.74609375,
462
+ "learning_rate": 2.4652940258929964e-05,
463
+ "loss": 0.8698,
464
+ "step": 6500
465
+ },
466
+ {
467
+ "epoch": 0.5147402901263454,
468
+ "grad_norm": 60228.23828125,
469
+ "learning_rate": 2.426298549368273e-05,
470
+ "loss": 0.8374,
471
+ "step": 6600
472
+ },
473
+ {
474
+ "epoch": 0.52253938543129,
475
+ "grad_norm": 73940.3125,
476
+ "learning_rate": 2.3873030728435502e-05,
477
+ "loss": 0.8669,
478
+ "step": 6700
479
+ },
480
+ {
481
+ "epoch": 0.5303384807362346,
482
+ "grad_norm": 80768.6953125,
483
+ "learning_rate": 2.3483075963188273e-05,
484
+ "loss": 0.8708,
485
+ "step": 6800
486
+ },
487
+ {
488
+ "epoch": 0.5381375760411792,
489
+ "grad_norm": 67096.984375,
490
+ "learning_rate": 2.309312119794104e-05,
491
+ "loss": 0.8797,
492
+ "step": 6900
493
+ },
494
+ {
495
+ "epoch": 0.5459366713461239,
496
+ "grad_norm": 56897.71875,
497
+ "learning_rate": 2.2703166432693808e-05,
498
+ "loss": 0.8728,
499
+ "step": 7000
500
+ },
501
+ {
502
+ "epoch": 0.5537357666510685,
503
+ "grad_norm": 64982.1015625,
504
+ "learning_rate": 2.2313211667446575e-05,
505
+ "loss": 0.8735,
506
+ "step": 7100
507
+ },
508
+ {
509
+ "epoch": 0.5615348619560131,
510
+ "grad_norm": 62320.35546875,
511
+ "learning_rate": 2.1923256902199346e-05,
512
+ "loss": 0.8519,
513
+ "step": 7200
514
+ },
515
+ {
516
+ "epoch": 0.5693339572609577,
517
+ "grad_norm": 74915.2578125,
518
+ "learning_rate": 2.1533302136952117e-05,
519
+ "loss": 0.849,
520
+ "step": 7300
521
+ },
522
+ {
523
+ "epoch": 0.5771330525659023,
524
+ "grad_norm": 69091.40625,
525
+ "learning_rate": 2.1143347371704884e-05,
526
+ "loss": 0.8451,
527
+ "step": 7400
528
+ },
529
+ {
530
+ "epoch": 0.584932147870847,
531
+ "grad_norm": 75015.2109375,
532
+ "learning_rate": 2.0753392606457652e-05,
533
+ "loss": 0.8757,
534
+ "step": 7500
535
+ },
536
+ {
537
+ "epoch": 0.5927312431757916,
538
+ "grad_norm": 59275.14453125,
539
+ "learning_rate": 2.036343784121042e-05,
540
+ "loss": 0.8657,
541
+ "step": 7600
542
+ },
543
+ {
544
+ "epoch": 0.6005303384807362,
545
+ "grad_norm": 64822.92578125,
546
+ "learning_rate": 1.997348307596319e-05,
547
+ "loss": 0.8555,
548
+ "step": 7700
549
+ },
550
+ {
551
+ "epoch": 0.6083294337856808,
552
+ "grad_norm": 63732.34765625,
553
+ "learning_rate": 1.9583528310715957e-05,
554
+ "loss": 0.8415,
555
+ "step": 7800
556
+ },
557
+ {
558
+ "epoch": 0.6161285290906254,
559
+ "grad_norm": 66452.859375,
560
+ "learning_rate": 1.9193573545468728e-05,
561
+ "loss": 0.8475,
562
+ "step": 7900
563
+ },
564
+ {
565
+ "epoch": 0.6239276243955701,
566
+ "grad_norm": 69227.2890625,
567
+ "learning_rate": 1.8803618780221496e-05,
568
+ "loss": 0.8333,
569
+ "step": 8000
570
+ },
571
+ {
572
+ "epoch": 0.6317267197005147,
573
+ "grad_norm": 67040.7578125,
574
+ "learning_rate": 1.8413664014974263e-05,
575
+ "loss": 0.8317,
576
+ "step": 8100
577
+ },
578
+ {
579
+ "epoch": 0.6395258150054594,
580
+ "grad_norm": 74801.2734375,
581
+ "learning_rate": 1.8023709249727034e-05,
582
+ "loss": 0.8726,
583
+ "step": 8200
584
+ },
585
+ {
586
+ "epoch": 0.647324910310404,
587
+ "grad_norm": 63112.703125,
588
+ "learning_rate": 1.76337544844798e-05,
589
+ "loss": 0.8442,
590
+ "step": 8300
591
+ },
592
+ {
593
+ "epoch": 0.6551240056153487,
594
+ "grad_norm": 81685.2265625,
595
+ "learning_rate": 1.724379971923257e-05,
596
+ "loss": 0.8316,
597
+ "step": 8400
598
+ },
599
+ {
600
+ "epoch": 0.6629231009202933,
601
+ "grad_norm": 69446.4921875,
602
+ "learning_rate": 1.6853844953985336e-05,
603
+ "loss": 0.8391,
604
+ "step": 8500
605
+ },
606
+ {
607
+ "epoch": 0.6707221962252379,
608
+ "grad_norm": 66903.671875,
609
+ "learning_rate": 1.6463890188738107e-05,
610
+ "loss": 0.8144,
611
+ "step": 8600
612
+ },
613
+ {
614
+ "epoch": 0.6785212915301825,
615
+ "grad_norm": 76092.6328125,
616
+ "learning_rate": 1.6073935423490877e-05,
617
+ "loss": 0.8441,
618
+ "step": 8700
619
+ },
620
+ {
621
+ "epoch": 0.6863203868351271,
622
+ "grad_norm": 71659.8515625,
623
+ "learning_rate": 1.5683980658243645e-05,
624
+ "loss": 0.8388,
625
+ "step": 8800
626
+ },
627
+ {
628
+ "epoch": 0.6941194821400718,
629
+ "grad_norm": 61097.1171875,
630
+ "learning_rate": 1.5294025892996412e-05,
631
+ "loss": 0.8383,
632
+ "step": 8900
633
+ },
634
+ {
635
+ "epoch": 0.7019185774450164,
636
+ "grad_norm": 60165.2734375,
637
+ "learning_rate": 1.4904071127749181e-05,
638
+ "loss": 0.8442,
639
+ "step": 9000
640
+ },
641
+ {
642
+ "epoch": 0.709717672749961,
643
+ "grad_norm": 71296.1171875,
644
+ "learning_rate": 1.451411636250195e-05,
645
+ "loss": 0.8537,
646
+ "step": 9100
647
+ },
648
+ {
649
+ "epoch": 0.7175167680549056,
650
+ "grad_norm": 67209.6328125,
651
+ "learning_rate": 1.4124161597254721e-05,
652
+ "loss": 0.8575,
653
+ "step": 9200
654
+ },
655
+ {
656
+ "epoch": 0.7253158633598502,
657
+ "grad_norm": 63280.3203125,
658
+ "learning_rate": 1.3734206832007487e-05,
659
+ "loss": 0.8358,
660
+ "step": 9300
661
+ },
662
+ {
663
+ "epoch": 0.7331149586647949,
664
+ "grad_norm": 69636.6875,
665
+ "learning_rate": 1.3344252066760258e-05,
666
+ "loss": 0.8475,
667
+ "step": 9400
668
+ },
669
+ {
670
+ "epoch": 0.7409140539697395,
671
+ "grad_norm": 68302.3828125,
672
+ "learning_rate": 1.2954297301513025e-05,
673
+ "loss": 0.8305,
674
+ "step": 9500
675
+ },
676
+ {
677
+ "epoch": 0.7487131492746841,
678
+ "grad_norm": 68986.78125,
679
+ "learning_rate": 1.2564342536265794e-05,
680
+ "loss": 0.8667,
681
+ "step": 9600
682
+ },
683
+ {
684
+ "epoch": 0.7565122445796287,
685
+ "grad_norm": 69115.0546875,
686
+ "learning_rate": 1.2174387771018563e-05,
687
+ "loss": 0.8386,
688
+ "step": 9700
689
+ },
690
+ {
691
+ "epoch": 0.7643113398845733,
692
+ "grad_norm": 58786.73046875,
693
+ "learning_rate": 1.178443300577133e-05,
694
+ "loss": 0.8306,
695
+ "step": 9800
696
+ },
697
+ {
698
+ "epoch": 0.772110435189518,
699
+ "grad_norm": 75233.265625,
700
+ "learning_rate": 1.13944782405241e-05,
701
+ "loss": 0.8611,
702
+ "step": 9900
703
+ },
704
+ {
705
+ "epoch": 0.7799095304944627,
706
+ "grad_norm": 65838.609375,
707
+ "learning_rate": 1.1004523475276867e-05,
708
+ "loss": 0.8191,
709
+ "step": 10000
710
+ },
711
+ {
712
+ "epoch": 0.7877086257994073,
713
+ "grad_norm": 55231.33984375,
714
+ "learning_rate": 1.0614568710029638e-05,
715
+ "loss": 0.8551,
716
+ "step": 10100
717
+ },
718
+ {
719
+ "epoch": 0.7955077211043519,
720
+ "grad_norm": 63516.97265625,
721
+ "learning_rate": 1.0224613944782405e-05,
722
+ "loss": 0.8685,
723
+ "step": 10200
724
+ },
725
+ {
726
+ "epoch": 0.8033068164092966,
727
+ "grad_norm": 63661.3671875,
728
+ "learning_rate": 9.834659179535174e-06,
729
+ "loss": 0.8009,
730
+ "step": 10300
731
+ },
732
+ {
733
+ "epoch": 0.8111059117142412,
734
+ "grad_norm": 71024.375,
735
+ "learning_rate": 9.444704414287943e-06,
736
+ "loss": 0.8331,
737
+ "step": 10400
738
+ },
739
+ {
740
+ "epoch": 0.8189050070191858,
741
+ "grad_norm": 80821.6640625,
742
+ "learning_rate": 9.05474964904071e-06,
743
+ "loss": 0.8344,
744
+ "step": 10500
745
+ },
746
+ {
747
+ "epoch": 0.8267041023241304,
748
+ "grad_norm": 61162.19140625,
749
+ "learning_rate": 8.66479488379348e-06,
750
+ "loss": 0.8454,
751
+ "step": 10600
752
+ },
753
+ {
754
+ "epoch": 0.834503197629075,
755
+ "grad_norm": 64971.5390625,
756
+ "learning_rate": 8.274840118546249e-06,
757
+ "loss": 0.8247,
758
+ "step": 10700
759
+ },
760
+ {
761
+ "epoch": 0.8423022929340197,
762
+ "grad_norm": 64816.95703125,
763
+ "learning_rate": 7.884885353299018e-06,
764
+ "loss": 0.8142,
765
+ "step": 10800
766
+ },
767
+ {
768
+ "epoch": 0.8501013882389643,
769
+ "grad_norm": 60450.6875,
770
+ "learning_rate": 7.494930588051786e-06,
771
+ "loss": 0.8128,
772
+ "step": 10900
773
+ },
774
+ {
775
+ "epoch": 0.8579004835439089,
776
+ "grad_norm": 71442.4375,
777
+ "learning_rate": 7.104975822804555e-06,
778
+ "loss": 0.8166,
779
+ "step": 11000
780
+ },
781
+ {
782
+ "epoch": 0.8656995788488535,
783
+ "grad_norm": 76537.1875,
784
+ "learning_rate": 6.715021057557324e-06,
785
+ "loss": 0.852,
786
+ "step": 11100
787
+ },
788
+ {
789
+ "epoch": 0.8734986741537981,
790
+ "grad_norm": 65081.37890625,
791
+ "learning_rate": 6.325066292310092e-06,
792
+ "loss": 0.8394,
793
+ "step": 11200
794
+ },
795
+ {
796
+ "epoch": 0.8812977694587428,
797
+ "grad_norm": 73682.296875,
798
+ "learning_rate": 5.935111527062861e-06,
799
+ "loss": 0.8165,
800
+ "step": 11300
801
+ },
802
+ {
803
+ "epoch": 0.8890968647636874,
804
+ "grad_norm": 73644.78125,
805
+ "learning_rate": 5.54515676181563e-06,
806
+ "loss": 0.8312,
807
+ "step": 11400
808
+ },
809
+ {
810
+ "epoch": 0.896895960068632,
811
+ "grad_norm": 79621.890625,
812
+ "learning_rate": 5.155201996568398e-06,
813
+ "loss": 0.8404,
814
+ "step": 11500
815
+ },
816
+ {
817
+ "epoch": 0.9046950553735766,
818
+ "grad_norm": 56045.7734375,
819
+ "learning_rate": 4.7652472313211666e-06,
820
+ "loss": 0.8295,
821
+ "step": 11600
822
+ },
823
+ {
824
+ "epoch": 0.9124941506785212,
825
+ "grad_norm": 66463.71875,
826
+ "learning_rate": 4.375292466073936e-06,
827
+ "loss": 0.8256,
828
+ "step": 11700
829
+ },
830
+ {
831
+ "epoch": 0.920293245983466,
832
+ "grad_norm": 58425.23046875,
833
+ "learning_rate": 3.985337700826705e-06,
834
+ "loss": 0.8253,
835
+ "step": 11800
836
+ },
837
+ {
838
+ "epoch": 0.9280923412884106,
839
+ "grad_norm": 68089.9609375,
840
+ "learning_rate": 3.595382935579473e-06,
841
+ "loss": 0.8161,
842
+ "step": 11900
843
+ },
844
+ {
845
+ "epoch": 0.9358914365933552,
846
+ "grad_norm": 63161.6796875,
847
+ "learning_rate": 3.2054281703322412e-06,
848
+ "loss": 0.8322,
849
+ "step": 12000
850
+ },
851
+ {
852
+ "epoch": 0.9436905318982998,
853
+ "grad_norm": 56655.69921875,
854
+ "learning_rate": 2.8154734050850103e-06,
855
+ "loss": 0.8117,
856
+ "step": 12100
857
+ },
858
+ {
859
+ "epoch": 0.9514896272032444,
860
+ "grad_norm": 72667.96875,
861
+ "learning_rate": 2.425518639837779e-06,
862
+ "loss": 0.8181,
863
+ "step": 12200
864
+ },
865
+ {
866
+ "epoch": 0.9592887225081891,
867
+ "grad_norm": 64209.17578125,
868
+ "learning_rate": 2.0355638745905476e-06,
869
+ "loss": 0.813,
870
+ "step": 12300
871
+ },
872
+ {
873
+ "epoch": 0.9670878178131337,
874
+ "grad_norm": 68720.7109375,
875
+ "learning_rate": 1.6456091093433163e-06,
876
+ "loss": 0.8343,
877
+ "step": 12400
878
+ },
879
+ {
880
+ "epoch": 0.9748869131180783,
881
+ "grad_norm": 62702.68359375,
882
+ "learning_rate": 1.255654344096085e-06,
883
+ "loss": 0.8156,
884
+ "step": 12500
885
+ },
886
+ {
887
+ "epoch": 0.9826860084230229,
888
+ "grad_norm": 62949.0625,
889
+ "learning_rate": 8.656995788488536e-07,
890
+ "loss": 0.8343,
891
+ "step": 12600
892
+ },
893
+ {
894
+ "epoch": 0.9904851037279676,
895
+ "grad_norm": 64140.0703125,
896
+ "learning_rate": 4.757448136016222e-07,
897
+ "loss": 0.8189,
898
+ "step": 12700
899
+ },
900
+ {
901
+ "epoch": 0.9982841990329122,
902
+ "grad_norm": 72402.578125,
903
+ "learning_rate": 8.579004835439089e-08,
904
+ "loss": 0.823,
905
+ "step": 12800
906
+ },
907
  {
908
  "epoch": 1.0,
909
+ "step": 12822,
910
+ "total_flos": 1.955920022807249e+18,
911
+ "train_loss": 0.8700304180128008,
912
+ "train_runtime": 179145.9612,
913
+ "train_samples_per_second": 0.859,
914
  "train_steps_per_second": 0.072
915
  }
916
  ],
917
  "logging_steps": 100,
918
+ "max_steps": 12822,
919
  "num_input_tokens_seen": 0,
920
  "num_train_epochs": 1,
921
+ "save_steps": 12822,
922
  "stateful_callbacks": {
923
  "TrainerControl": {
924
  "args": {
 
931
  "attributes": {}
932
  }
933
  },
934
+ "total_flos": 1.955920022807249e+18,
935
  "train_batch_size": 12,
936
  "trial_name": null,
937
  "trial_params": null