chansung committed on
Commit
da3efe6
1 Parent(s): 08805c3

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,10 @@
1
  ---
2
  base_model: meta-llama/Meta-Llama-3-8B
3
  datasets:
4
- - llama-duo/synth_classification_dataset_dedup
5
  library_name: peft
6
  license: llama3
7
  tags:
8
- - alignment-handbook
9
  - trl
10
  - sft
11
  - generated_from_trainer
@@ -19,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # llama3-8b-classification-gpt4o-100k2
21
 
22
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the llama-duo/synth_classification_dataset_dedup dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 1.8048
25
 
26
  ## Model description
27
 
@@ -40,7 +39,7 @@ More information needed
40
  ### Training hyperparameters
41
 
42
  The following hyperparameters were used during training:
43
- - learning_rate: 0.0002
44
  - train_batch_size: 4
45
  - eval_batch_size: 2
46
  - seed: 42
@@ -58,11 +57,11 @@ The following hyperparameters were used during training:
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:------:|:----:|:---------------:|
61
- | 1.4428 | 0.9978 | 225 | 1.7488 |
62
- | 1.3593 | 2.0 | 451 | 1.7692 |
63
- | 1.3157 | 2.9978 | 676 | 1.7700 |
64
- | 1.2688 | 4.0 | 902 | 1.7925 |
65
- | 1.2699 | 4.9889 | 1125 | 1.8048 |
66
 
67
 
68
  ### Framework versions
 
1
  ---
2
  base_model: meta-llama/Meta-Llama-3-8B
3
  datasets:
4
+ - generator
5
  library_name: peft
6
  license: llama3
7
  tags:
 
8
  - trl
9
  - sft
10
  - generated_from_trainer
 
18
 
19
  # llama3-8b-classification-gpt4o-100k2
20
 
21
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.9855
24
 
25
  ## Model description
26
 
 
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
42
+ - learning_rate: 0.001
43
  - train_batch_size: 4
44
  - eval_batch_size: 2
45
  - seed: 42
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.3936 | 0.9978 | 225 | 1.7919 |
61
+ | 1.31 | 2.0 | 451 | 1.7869 |
62
+ | 1.2416 | 2.9978 | 676 | 1.8242 |
63
+ | 1.1447 | 4.0 | 902 | 1.9029 |
64
+ | 1.098 | 4.9889 | 1125 | 1.9855 |
65
 
66
 
67
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04745543e5ca5f0b2a29641e6b0ef9c0308e904787c2f48822c79e0bc14c96b3
3
  size 54560368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db7bd9c8302a14152825b1565098bb348e95672b1b681c278064f37d3247f78a
3
  size 54560368
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 4.988913525498892,
3
- "eval_loss": 1.8047916889190674,
4
- "eval_runtime": 0.3807,
5
- "eval_samples": 16,
6
- "eval_samples_per_second": 2.626,
7
- "eval_steps_per_second": 2.626,
8
  "total_flos": 1.6629843858229821e+18,
9
- "train_loss": 1.3908887059953479,
10
- "train_runtime": 3606.4613,
11
  "train_samples": 92634,
12
- "train_samples_per_second": 9.989,
13
  "train_steps_per_second": 0.312
14
  }
 
1
  {
2
  "epoch": 4.988913525498892,
 
 
 
 
 
3
  "total_flos": 1.6629843858229821e+18,
4
+ "train_loss": 1.2764637470245361,
5
+ "train_runtime": 3608.211,
6
  "train_samples": 92634,
7
+ "train_samples_per_second": 9.984,
8
  "train_steps_per_second": 0.312
9
  }
runs/Aug08_09-08-03_main-slippery-chicken-1-0-0/events.out.tfevents.1723122734.main-slippery-chicken-1-0-0.543.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6992b9b3411d48e794959d9c67adc79fcca93de2e8c95a7bf718b2fcf16dfec5
3
- size 54310
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9322fb344d2804ba15f9d3091935d3b469d27e371531332649e679ff481a7db7
3
+ size 55990
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 4.988913525498892,
3
  "total_flos": 1.6629843858229821e+18,
4
- "train_loss": 1.3908887059953479,
5
- "train_runtime": 3606.4613,
6
  "train_samples": 92634,
7
- "train_samples_per_second": 9.989,
8
  "train_steps_per_second": 0.312
9
  }
 
1
  {
2
  "epoch": 4.988913525498892,
3
  "total_flos": 1.6629843858229821e+18,
4
+ "train_loss": 1.2764637470245361,
5
+ "train_runtime": 3608.211,
6
  "train_samples": 92634,
7
+ "train_samples_per_second": 9.984,
8
  "train_steps_per_second": 0.312
9
  }
trainer_state.json CHANGED
@@ -10,1633 +10,1633 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.004434589800443459,
13
- "grad_norm": 1.9136914014816284,
14
- "learning_rate": 1.7699115044247788e-06,
15
  "loss": 2.8127,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.022172949002217297,
20
- "grad_norm": 2.073108434677124,
21
- "learning_rate": 8.849557522123894e-06,
22
- "loss": 2.8249,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.04434589800443459,
27
- "grad_norm": 1.7010493278503418,
28
- "learning_rate": 1.7699115044247787e-05,
29
- "loss": 2.7382,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.06651884700665188,
34
- "grad_norm": 1.6392902135849,
35
- "learning_rate": 2.6548672566371686e-05,
36
- "loss": 2.6574,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.08869179600886919,
41
- "grad_norm": 1.1917133331298828,
42
- "learning_rate": 3.5398230088495574e-05,
43
- "loss": 2.4883,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.11086474501108648,
48
- "grad_norm": 0.7972588539123535,
49
- "learning_rate": 4.4247787610619477e-05,
50
- "loss": 2.3405,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.13303769401330376,
55
- "grad_norm": 0.8086889982223511,
56
- "learning_rate": 5.309734513274337e-05,
57
- "loss": 2.2187,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.15521064301552107,
62
- "grad_norm": 0.6612704396247864,
63
- "learning_rate": 6.194690265486725e-05,
64
- "loss": 2.1042,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.17738359201773837,
69
- "grad_norm": 0.6673336029052734,
70
- "learning_rate": 7.079646017699115e-05,
71
- "loss": 1.9928,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.19955654101995565,
76
- "grad_norm": 0.5582772493362427,
77
- "learning_rate": 7.964601769911504e-05,
78
- "loss": 1.9215,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.22172949002217296,
83
- "grad_norm": 0.43071338534355164,
84
- "learning_rate": 8.849557522123895e-05,
85
- "loss": 1.8657,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.24390243902439024,
90
- "grad_norm": 0.34688612818717957,
91
- "learning_rate": 9.734513274336283e-05,
92
- "loss": 1.7914,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.2660753880266075,
97
- "grad_norm": 0.3413448631763458,
98
- "learning_rate": 0.00010619469026548674,
99
- "loss": 1.7762,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.28824833702882485,
104
- "grad_norm": 0.313379168510437,
105
- "learning_rate": 0.00011504424778761063,
106
- "loss": 1.7352,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.31042128603104213,
111
- "grad_norm": 0.36069634556770325,
112
- "learning_rate": 0.0001238938053097345,
113
- "loss": 1.6967,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.3325942350332594,
118
- "grad_norm": 0.40712735056877136,
119
- "learning_rate": 0.00013274336283185842,
120
- "loss": 1.6816,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.35476718403547675,
125
- "grad_norm": 0.48003074526786804,
126
- "learning_rate": 0.0001415929203539823,
127
- "loss": 1.667,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.376940133037694,
132
- "grad_norm": 0.556786835193634,
133
- "learning_rate": 0.00015044247787610618,
134
- "loss": 1.6168,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.3991130820399113,
139
- "grad_norm": 0.41180524230003357,
140
- "learning_rate": 0.0001592920353982301,
141
- "loss": 1.596,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.4212860310421286,
146
- "grad_norm": 0.4306657910346985,
147
- "learning_rate": 0.000168141592920354,
148
- "loss": 1.569,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.4434589800443459,
153
- "grad_norm": 0.3725152313709259,
154
- "learning_rate": 0.0001769911504424779,
155
- "loss": 1.5671,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.4656319290465632,
160
- "grad_norm": 0.402552992105484,
161
- "learning_rate": 0.0001858407079646018,
162
- "loss": 1.5516,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.4878048780487805,
167
- "grad_norm": 0.3868615925312042,
168
- "learning_rate": 0.00019469026548672567,
169
- "loss": 1.534,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.5099778270509978,
174
- "grad_norm": 0.37014004588127136,
175
- "learning_rate": 0.00019999807262012045,
176
- "loss": 1.5358,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.532150776053215,
181
- "grad_norm": 0.3409845530986786,
182
- "learning_rate": 0.00019997639044970784,
183
- "loss": 1.5083,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.5543237250554324,
188
- "grad_norm": 0.42291566729545593,
189
- "learning_rate": 0.00019993062212508053,
190
- "loss": 1.5057,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.5764966740576497,
195
- "grad_norm": 0.3941720426082611,
196
- "learning_rate": 0.00019986077867267113,
197
- "loss": 1.4975,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.5986696230598669,
202
- "grad_norm": 0.47755852341651917,
203
- "learning_rate": 0.00019976687691905393,
204
- "loss": 1.4933,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.6208425720620843,
209
- "grad_norm": 0.5203377604484558,
210
- "learning_rate": 0.00019964893948689122,
211
- "loss": 1.5053,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.6430155210643016,
216
- "grad_norm": 0.3959715664386749,
217
- "learning_rate": 0.00019950699478948309,
218
- "loss": 1.4937,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.6651884700665188,
223
- "grad_norm": 0.3711565136909485,
224
- "learning_rate": 0.000199341077023922,
225
- "loss": 1.4995,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.6873614190687362,
230
- "grad_norm": 0.37361645698547363,
231
- "learning_rate": 0.00019915122616285418,
232
- "loss": 1.4863,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.7095343680709535,
237
- "grad_norm": 0.34434613585472107,
238
- "learning_rate": 0.00019893748794484948,
239
- "loss": 1.4752,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.7317073170731707,
244
- "grad_norm": 0.36264362931251526,
245
- "learning_rate": 0.0001986999138633821,
246
- "loss": 1.455,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.753880266075388,
251
- "grad_norm": 0.3910064995288849,
252
- "learning_rate": 0.00019843856115442482,
253
- "loss": 1.4627,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.7760532150776053,
258
- "grad_norm": 0.41227295994758606,
259
- "learning_rate": 0.00019815349278265988,
260
- "loss": 1.4723,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.7982261640798226,
265
- "grad_norm": 0.34940609335899353,
266
- "learning_rate": 0.00019784477742630952,
267
- "loss": 1.4692,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.8203991130820399,
272
- "grad_norm": 0.33894529938697815,
273
- "learning_rate": 0.00019751248946059014,
274
- "loss": 1.4513,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.8425720620842572,
279
- "grad_norm": 0.3944953978061676,
280
- "learning_rate": 0.00019715670893979414,
281
- "loss": 1.4504,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.8647450110864745,
286
- "grad_norm": 0.37704771757125854,
287
- "learning_rate": 0.00019677752157800312,
288
- "loss": 1.4383,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.8869179600886918,
293
- "grad_norm": 0.39401814341545105,
294
- "learning_rate": 0.0001963750187284379,
295
- "loss": 1.4524,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.9090909090909091,
300
- "grad_norm": 0.39860641956329346,
301
- "learning_rate": 0.00019594929736144976,
302
- "loss": 1.4363,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.9312638580931264,
307
- "grad_norm": 0.36418575048446655,
308
- "learning_rate": 0.0001955004600411586,
309
- "loss": 1.4199,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.9534368070953437,
314
- "grad_norm": 0.40804576873779297,
315
- "learning_rate": 0.0001950286149007434,
316
- "loss": 1.4151,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.975609756097561,
321
- "grad_norm": 0.4127281606197357,
322
- "learning_rate": 0.0001945338756163907,
323
- "loss": 1.4438,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.9977827050997783,
328
- "grad_norm": 0.4101060628890991,
329
- "learning_rate": 0.00019401636137990816,
330
- "loss": 1.4428,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.9977827050997783,
335
- "eval_loss": 1.748767375946045,
336
- "eval_runtime": 0.3757,
337
- "eval_samples_per_second": 2.662,
338
- "eval_steps_per_second": 2.662,
339
  "step": 225
340
  },
341
  {
342
  "epoch": 1.0199556541019956,
343
- "grad_norm": 0.5033897757530212,
344
- "learning_rate": 0.00019347619687000892,
345
- "loss": 1.4076,
346
  "step": 230
347
  },
348
  {
349
  "epoch": 1.042128603104213,
350
- "grad_norm": 0.34217771887779236,
351
- "learning_rate": 0.00019291351222227432,
352
- "loss": 1.4057,
353
  "step": 235
354
  },
355
  {
356
  "epoch": 1.06430155210643,
357
- "grad_norm": 0.3883781135082245,
358
- "learning_rate": 0.0001923284429978017,
359
- "loss": 1.4198,
360
  "step": 240
361
  },
362
  {
363
  "epoch": 1.0864745011086474,
364
- "grad_norm": 0.34347742795944214,
365
- "learning_rate": 0.00019172113015054532,
366
- "loss": 1.4052,
367
  "step": 245
368
  },
369
  {
370
  "epoch": 1.1086474501108647,
371
- "grad_norm": 0.4281429648399353,
372
- "learning_rate": 0.00019109171999335793,
373
- "loss": 1.4255,
374
  "step": 250
375
  },
376
  {
377
  "epoch": 1.130820399113082,
378
- "grad_norm": 0.377329558134079,
379
- "learning_rate": 0.00019044036416274133,
380
- "loss": 1.4081,
381
  "step": 255
382
  },
383
  {
384
  "epoch": 1.1529933481152994,
385
- "grad_norm": 0.3482955992221832,
386
- "learning_rate": 0.00018976721958231438,
387
- "loss": 1.3953,
388
  "step": 260
389
  },
390
  {
391
  "epoch": 1.1751662971175167,
392
- "grad_norm": 0.40098318457603455,
393
- "learning_rate": 0.00018907244842500704,
394
- "loss": 1.4021,
395
  "step": 265
396
  },
397
  {
398
  "epoch": 1.1973392461197339,
399
- "grad_norm": 0.43027186393737793,
400
- "learning_rate": 0.00018835621807399016,
401
- "loss": 1.4056,
402
  "step": 270
403
  },
404
  {
405
  "epoch": 1.2195121951219512,
406
- "grad_norm": 0.38251981139183044,
407
- "learning_rate": 0.0001876187010823496,
408
- "loss": 1.3814,
409
  "step": 275
410
  },
411
  {
412
  "epoch": 1.2416851441241685,
413
- "grad_norm": 0.330834299325943,
414
- "learning_rate": 0.00018686007513151514,
415
- "loss": 1.391,
416
  "step": 280
417
  },
418
  {
419
  "epoch": 1.2638580931263859,
420
- "grad_norm": 0.35464873909950256,
421
- "learning_rate": 0.0001860805229884536,
422
- "loss": 1.4068,
423
  "step": 285
424
  },
425
  {
426
  "epoch": 1.2860310421286032,
427
- "grad_norm": 0.365756094455719,
428
- "learning_rate": 0.00018528023246163717,
429
- "loss": 1.3833,
430
  "step": 290
431
  },
432
  {
433
  "epoch": 1.3082039911308203,
434
- "grad_norm": 0.36964625120162964,
435
- "learning_rate": 0.00018445939635579656,
436
- "loss": 1.3772,
437
  "step": 295
438
  },
439
  {
440
  "epoch": 1.3303769401330376,
441
- "grad_norm": 0.3860366642475128,
442
- "learning_rate": 0.0001836182124254711,
443
- "loss": 1.3901,
444
  "step": 300
445
  },
446
  {
447
  "epoch": 1.352549889135255,
448
- "grad_norm": 0.4819258153438568,
449
- "learning_rate": 0.00018275688332736577,
450
- "loss": 1.384,
451
  "step": 305
452
  },
453
  {
454
  "epoch": 1.3747228381374723,
455
- "grad_norm": 0.3682605028152466,
456
- "learning_rate": 0.00018187561657152757,
457
- "loss": 1.3729,
458
  "step": 310
459
  },
460
  {
461
  "epoch": 1.3968957871396896,
462
- "grad_norm": 0.35252609848976135,
463
- "learning_rate": 0.00018097462447135273,
464
- "loss": 1.3859,
465
  "step": 315
466
  },
467
  {
468
  "epoch": 1.4190687361419068,
469
- "grad_norm": 0.3672533631324768,
470
- "learning_rate": 0.00018005412409243606,
471
- "loss": 1.3677,
472
  "step": 320
473
  },
474
  {
475
  "epoch": 1.441241685144124,
476
- "grad_norm": 0.37152794003486633,
477
- "learning_rate": 0.00017911433720027624,
478
- "loss": 1.3611,
479
  "step": 325
480
  },
481
  {
482
  "epoch": 1.4634146341463414,
483
- "grad_norm": 0.4452725648880005,
484
- "learning_rate": 0.00017815549020684825,
485
- "loss": 1.3775,
486
  "step": 330
487
  },
488
  {
489
  "epoch": 1.4855875831485588,
490
- "grad_norm": 0.38919898867607117,
491
- "learning_rate": 0.0001771778141160566,
492
- "loss": 1.3799,
493
  "step": 335
494
  },
495
  {
496
  "epoch": 1.507760532150776,
497
- "grad_norm": 0.3549223244190216,
498
- "learning_rate": 0.0001761815444680822,
499
- "loss": 1.3807,
500
  "step": 340
501
  },
502
  {
503
  "epoch": 1.5299334811529932,
504
- "grad_norm": 0.3846336901187897,
505
- "learning_rate": 0.00017516692128263648,
506
- "loss": 1.3763,
507
  "step": 345
508
  },
509
  {
510
  "epoch": 1.5521064301552108,
511
- "grad_norm": 0.37670284509658813,
512
- "learning_rate": 0.00017413418900113605,
513
- "loss": 1.3678,
514
  "step": 350
515
  },
516
  {
517
  "epoch": 1.5742793791574279,
518
- "grad_norm": 0.37929922342300415,
519
- "learning_rate": 0.00017308359642781242,
520
- "loss": 1.377,
521
  "step": 355
522
  },
523
  {
524
  "epoch": 1.5964523281596452,
525
- "grad_norm": 0.41200995445251465,
526
- "learning_rate": 0.00017201539666977043,
527
- "loss": 1.3732,
528
  "step": 360
529
  },
530
  {
531
  "epoch": 1.6186252771618626,
532
- "grad_norm": 0.3412715196609497,
533
- "learning_rate": 0.0001709298470760101,
534
- "loss": 1.3726,
535
  "step": 365
536
  },
537
  {
538
  "epoch": 1.6407982261640797,
539
- "grad_norm": 0.3528231680393219,
540
- "learning_rate": 0.0001698272091754264,
541
- "loss": 1.3873,
542
  "step": 370
543
  },
544
  {
545
  "epoch": 1.6629711751662972,
546
- "grad_norm": 0.3747865855693817,
547
- "learning_rate": 0.00016870774861380228,
548
- "loss": 1.3665,
549
  "step": 375
550
  },
551
  {
552
  "epoch": 1.6851441241685143,
553
- "grad_norm": 0.40505874156951904,
554
- "learning_rate": 0.00016757173508980965,
555
- "loss": 1.3696,
556
  "step": 380
557
  },
558
  {
559
  "epoch": 1.7073170731707317,
560
- "grad_norm": 0.5333726406097412,
561
- "learning_rate": 0.00016641944229003395,
562
- "loss": 1.3748,
563
  "step": 385
564
  },
565
  {
566
  "epoch": 1.729490022172949,
567
- "grad_norm": 0.3569813370704651,
568
- "learning_rate": 0.00016525114782303807,
569
- "loss": 1.3631,
570
  "step": 390
571
  },
572
  {
573
  "epoch": 1.7516629711751663,
574
- "grad_norm": 0.3798842430114746,
575
- "learning_rate": 0.00016406713315248136,
576
- "loss": 1.3604,
577
  "step": 395
578
  },
579
  {
580
  "epoch": 1.7738359201773837,
581
- "grad_norm": 0.33182796835899353,
582
- "learning_rate": 0.00016286768352930973,
583
- "loss": 1.3692,
584
  "step": 400
585
  },
586
  {
587
  "epoch": 1.7960088691796008,
588
- "grad_norm": 0.3485672175884247,
589
- "learning_rate": 0.0001616530879230335,
590
- "loss": 1.3709,
591
  "step": 405
592
  },
593
  {
594
  "epoch": 1.8181818181818183,
595
- "grad_norm": 0.34393855929374695,
596
- "learning_rate": 0.00016042363895210946,
597
- "loss": 1.3494,
598
  "step": 410
599
  },
600
  {
601
  "epoch": 1.8403547671840355,
602
- "grad_norm": 0.34030023217201233,
603
- "learning_rate": 0.00015917963281344345,
604
- "loss": 1.3587,
605
  "step": 415
606
  },
607
  {
608
  "epoch": 1.8625277161862528,
609
- "grad_norm": 0.34356924891471863,
610
- "learning_rate": 0.00015792136921103124,
611
- "loss": 1.3473,
612
  "step": 420
613
  },
614
  {
615
  "epoch": 1.8847006651884701,
616
- "grad_norm": 0.3498888909816742,
617
- "learning_rate": 0.0001566491512837543,
618
- "loss": 1.3722,
619
  "step": 425
620
  },
621
  {
622
  "epoch": 1.9068736141906872,
623
- "grad_norm": 0.36746105551719666,
624
- "learning_rate": 0.00015536328553234792,
625
- "loss": 1.3743,
626
  "step": 430
627
  },
628
  {
629
  "epoch": 1.9290465631929048,
630
- "grad_norm": 0.33155015110969543,
631
- "learning_rate": 0.00015406408174555976,
632
- "loss": 1.3496,
633
  "step": 435
634
  },
635
  {
636
  "epoch": 1.951219512195122,
637
- "grad_norm": 0.3875883221626282,
638
- "learning_rate": 0.00015275185292551585,
639
- "loss": 1.3618,
640
  "step": 440
641
  },
642
  {
643
  "epoch": 1.9733924611973392,
644
- "grad_norm": 0.38502293825149536,
645
- "learning_rate": 0.00015142691521231267,
646
- "loss": 1.3572,
647
  "step": 445
648
  },
649
  {
650
  "epoch": 1.9955654101995566,
651
- "grad_norm": 0.382053405046463,
652
- "learning_rate": 0.0001500895878078532,
653
- "loss": 1.3593,
654
  "step": 450
655
  },
656
  {
657
  "epoch": 2.0,
658
- "eval_loss": 1.7692114114761353,
659
- "eval_runtime": 0.3373,
660
- "eval_samples_per_second": 2.965,
661
- "eval_steps_per_second": 2.965,
662
  "step": 451
663
  },
664
  {
665
  "epoch": 2.0177383592017737,
666
- "grad_norm": 0.35466691851615906,
667
- "learning_rate": 0.00014874019289894537,
668
- "loss": 1.3497,
669
  "step": 455
670
  },
671
  {
672
  "epoch": 2.0399113082039912,
673
- "grad_norm": 0.35485056042671204,
674
- "learning_rate": 0.00014737905557968105,
675
- "loss": 1.3208,
676
  "step": 460
677
  },
678
  {
679
  "epoch": 2.0620842572062084,
680
- "grad_norm": 0.3566678464412689,
681
- "learning_rate": 0.00014600650377311522,
682
- "loss": 1.3366,
683
  "step": 465
684
  },
685
  {
686
  "epoch": 2.084257206208426,
687
- "grad_norm": 0.34695708751678467,
688
- "learning_rate": 0.00014462286815226314,
689
- "loss": 1.3231,
690
  "step": 470
691
  },
692
  {
693
  "epoch": 2.106430155210643,
694
- "grad_norm": 0.4017435610294342,
695
- "learning_rate": 0.00014322848206043505,
696
- "loss": 1.3277,
697
  "step": 475
698
  },
699
  {
700
  "epoch": 2.12860310421286,
701
- "grad_norm": 0.3486361801624298,
702
- "learning_rate": 0.00014182368143092768,
703
- "loss": 1.325,
704
  "step": 480
705
  },
706
  {
707
  "epoch": 2.1507760532150777,
708
- "grad_norm": 0.37655895948410034,
709
- "learning_rate": 0.00014040880470609187,
710
- "loss": 1.3062,
711
  "step": 485
712
  },
713
  {
714
  "epoch": 2.172949002217295,
715
- "grad_norm": 0.36101478338241577,
716
- "learning_rate": 0.00013898419275579522,
717
- "loss": 1.3053,
718
  "step": 490
719
  },
720
  {
721
  "epoch": 2.1951219512195124,
722
- "grad_norm": 0.37982380390167236,
723
- "learning_rate": 0.00013755018879530075,
724
- "loss": 1.328,
725
  "step": 495
726
  },
727
  {
728
  "epoch": 2.2172949002217295,
729
- "grad_norm": 0.42325547337532043,
730
- "learning_rate": 0.00013610713830257954,
731
- "loss": 1.324,
732
  "step": 500
733
  },
734
  {
735
  "epoch": 2.2394678492239466,
736
- "grad_norm": 0.3722022473812103,
737
- "learning_rate": 0.00013465538893507907,
738
- "loss": 1.3335,
739
  "step": 505
740
  },
741
  {
742
  "epoch": 2.261640798226164,
743
- "grad_norm": 0.347001314163208,
744
- "learning_rate": 0.00013319529044596593,
745
- "loss": 1.3243,
746
  "step": 510
747
  },
748
  {
749
  "epoch": 2.2838137472283813,
750
- "grad_norm": 0.3675149381160736,
751
- "learning_rate": 0.00013172719459986397,
752
- "loss": 1.3185,
753
  "step": 515
754
  },
755
  {
756
  "epoch": 2.305986696230599,
757
- "grad_norm": 0.39877083897590637,
758
- "learning_rate": 0.0001302514550881076,
759
- "loss": 1.3114,
760
  "step": 520
761
  },
762
  {
763
  "epoch": 2.328159645232816,
764
- "grad_norm": 1.4218645095825195,
765
- "learning_rate": 0.00012876842744353112,
766
- "loss": 1.3175,
767
  "step": 525
768
  },
769
  {
770
  "epoch": 2.3503325942350335,
771
- "grad_norm": 0.3768659234046936,
772
- "learning_rate": 0.00012727846895481434,
773
- "loss": 1.3201,
774
  "step": 530
775
  },
776
  {
777
  "epoch": 2.3725055432372506,
778
- "grad_norm": 0.35776379704475403,
779
- "learning_rate": 0.00012578193858040507,
780
- "loss": 1.3274,
781
  "step": 535
782
  },
783
  {
784
  "epoch": 2.3946784922394677,
785
- "grad_norm": 0.3721405267715454,
786
- "learning_rate": 0.0001242791968620394,
787
- "loss": 1.3307,
788
  "step": 540
789
  },
790
  {
791
  "epoch": 2.4168514412416853,
792
- "grad_norm": 0.37725499272346497,
793
- "learning_rate": 0.00012277060583788064,
794
- "loss": 1.3324,
795
  "step": 545
796
  },
797
  {
798
  "epoch": 2.4390243902439024,
799
- "grad_norm": 0.3748611807823181,
800
- "learning_rate": 0.00012125652895529766,
801
- "loss": 1.3269,
802
  "step": 550
803
  },
804
  {
805
  "epoch": 2.4611973392461195,
806
- "grad_norm": 0.3902246952056885,
807
- "learning_rate": 0.00011973733098330368,
808
- "loss": 1.3257,
809
  "step": 555
810
  },
811
  {
812
  "epoch": 2.483370288248337,
813
- "grad_norm": 0.37103980779647827,
814
- "learning_rate": 0.0001182133779246766,
815
- "loss": 1.3073,
816
  "step": 560
817
  },
818
  {
819
  "epoch": 2.505543237250554,
820
- "grad_norm": 0.38661038875579834,
821
- "learning_rate": 0.00011668503692778239,
822
- "loss": 1.3261,
823
  "step": 565
824
  },
825
  {
826
  "epoch": 2.5277161862527717,
827
- "grad_norm": 0.3725627362728119,
828
- "learning_rate": 0.00011515267619812214,
829
- "loss": 1.3207,
830
  "step": 570
831
  },
832
  {
833
  "epoch": 2.549889135254989,
834
- "grad_norm": 0.3544328212738037,
835
- "learning_rate": 0.00011361666490962468,
836
- "loss": 1.3254,
837
  "step": 575
838
  },
839
  {
840
  "epoch": 2.5720620842572064,
841
- "grad_norm": 0.37323811650276184,
842
- "learning_rate": 0.00011207737311570559,
843
- "loss": 1.3216,
844
  "step": 580
845
  },
846
  {
847
  "epoch": 2.5942350332594235,
848
- "grad_norm": 0.42373356223106384,
849
- "learning_rate": 0.00011053517166011471,
850
- "loss": 1.3131,
851
  "step": 585
852
  },
853
  {
854
  "epoch": 2.6164079822616406,
855
- "grad_norm": 0.37566041946411133,
856
- "learning_rate": 0.00010899043208759305,
857
- "loss": 1.3357,
858
  "step": 590
859
  },
860
  {
861
  "epoch": 2.638580931263858,
862
- "grad_norm": 0.36090078949928284,
863
- "learning_rate": 0.00010744352655436059,
864
- "loss": 1.3416,
865
  "step": 595
866
  },
867
  {
868
  "epoch": 2.6607538802660753,
869
- "grad_norm": 0.3593694567680359,
870
- "learning_rate": 0.00010589482773845727,
871
- "loss": 1.3277,
872
  "step": 600
873
  },
874
  {
875
  "epoch": 2.682926829268293,
876
- "grad_norm": 0.356411337852478,
877
- "learning_rate": 0.00010434470874995781,
878
- "loss": 1.3114,
879
  "step": 605
880
  },
881
  {
882
  "epoch": 2.70509977827051,
883
- "grad_norm": 0.376769095659256,
884
- "learning_rate": 0.00010279354304108271,
885
- "loss": 1.3027,
886
  "step": 610
887
  },
888
  {
889
  "epoch": 2.7272727272727275,
890
- "grad_norm": 0.3567696213722229,
891
- "learning_rate": 0.0001012417043162266,
892
- "loss": 1.3058,
893
  "step": 615
894
  },
895
  {
896
  "epoch": 2.7494456762749446,
897
- "grad_norm": 0.3865494728088379,
898
- "learning_rate": 9.968956644192617e-05,
899
- "loss": 1.3247,
900
  "step": 620
901
  },
902
  {
903
  "epoch": 2.7716186252771617,
904
- "grad_norm": 0.3545072674751282,
905
- "learning_rate": 9.813750335678866e-05,
906
- "loss": 1.319,
907
  "step": 625
908
  },
909
  {
910
  "epoch": 2.7937915742793793,
911
- "grad_norm": 0.3772957921028137,
912
- "learning_rate": 9.658588898140322e-05,
913
- "loss": 1.3151,
914
  "step": 630
915
  },
916
  {
917
  "epoch": 2.8159645232815964,
918
- "grad_norm": 0.3590118885040283,
919
- "learning_rate": 9.503509712825658e-05,
920
- "loss": 1.3115,
921
  "step": 635
922
  },
923
  {
924
  "epoch": 2.8381374722838135,
925
- "grad_norm": 0.3592722713947296,
926
- "learning_rate": 9.348550141167472e-05,
927
- "loss": 1.2971,
928
  "step": 640
929
  },
930
  {
931
  "epoch": 2.860310421286031,
932
- "grad_norm": 0.35377687215805054,
933
- "learning_rate": 9.193747515781224e-05,
934
- "loss": 1.3128,
935
  "step": 645
936
  },
937
  {
938
  "epoch": 2.882483370288248,
939
- "grad_norm": 0.36321666836738586,
940
- "learning_rate": 9.039139131471128e-05,
941
- "loss": 1.3141,
942
  "step": 650
943
  },
944
  {
945
  "epoch": 2.9046563192904657,
946
- "grad_norm": 0.36464789509773254,
947
- "learning_rate": 8.884762236245145e-05,
948
- "loss": 1.3136,
949
  "step": 655
950
  },
951
  {
952
  "epoch": 2.926829268292683,
953
- "grad_norm": 0.36149683594703674,
954
- "learning_rate": 8.730654022341256e-05,
955
- "loss": 1.3242,
956
  "step": 660
957
  },
958
  {
959
  "epoch": 2.9490022172949004,
960
- "grad_norm": 0.3480617105960846,
961
- "learning_rate": 8.57685161726715e-05,
962
- "loss": 1.3297,
963
  "step": 665
964
  },
965
  {
966
  "epoch": 2.9711751662971175,
967
- "grad_norm": 0.3669663965702057,
968
- "learning_rate": 8.423392074855545e-05,
969
- "loss": 1.3073,
970
  "step": 670
971
  },
972
  {
973
  "epoch": 2.9933481152993346,
974
- "grad_norm": 0.34812483191490173,
975
- "learning_rate": 8.270312366337226e-05,
976
- "loss": 1.3157,
977
  "step": 675
978
  },
979
  {
980
  "epoch": 2.9977827050997785,
981
- "eval_loss": 1.7700002193450928,
982
- "eval_runtime": 0.3583,
983
- "eval_samples_per_second": 2.791,
984
- "eval_steps_per_second": 2.791,
985
  "step": 676
986
  },
987
  {
988
  "epoch": 3.015521064301552,
989
- "grad_norm": 0.3915573060512543,
990
- "learning_rate": 8.117649371433994e-05,
991
- "loss": 1.298,
992
  "step": 680
993
  },
994
  {
995
  "epoch": 3.0376940133037693,
996
- "grad_norm": 0.37077027559280396,
997
- "learning_rate": 7.965439869473664e-05,
998
- "loss": 1.2853,
999
  "step": 685
1000
  },
1001
  {
1002
  "epoch": 3.059866962305987,
1003
- "grad_norm": 0.3731926381587982,
1004
- "learning_rate": 7.813720530529243e-05,
1005
- "loss": 1.2908,
1006
  "step": 690
1007
  },
1008
  {
1009
  "epoch": 3.082039911308204,
1010
- "grad_norm": 0.37934252619743347,
1011
- "learning_rate": 7.66252790658445e-05,
1012
- "loss": 1.2732,
1013
  "step": 695
1014
  },
1015
  {
1016
  "epoch": 3.104212860310421,
1017
- "grad_norm": 0.4166705310344696,
1018
- "learning_rate": 7.511898422727642e-05,
1019
- "loss": 1.2702,
1020
  "step": 700
1021
  },
1022
  {
1023
  "epoch": 3.1263858093126387,
1024
- "grad_norm": 0.3665112257003784,
1025
- "learning_rate": 7.361868368376364e-05,
1026
- "loss": 1.2801,
1027
  "step": 705
1028
  },
1029
  {
1030
  "epoch": 3.1485587583148558,
1031
- "grad_norm": 0.393635094165802,
1032
- "learning_rate": 7.212473888534546e-05,
1033
- "loss": 1.2873,
1034
  "step": 710
1035
  },
1036
  {
1037
  "epoch": 3.1707317073170733,
1038
- "grad_norm": 0.37354576587677,
1039
- "learning_rate": 7.063750975084518e-05,
1040
- "loss": 1.2823,
1041
  "step": 715
1042
  },
1043
  {
1044
  "epoch": 3.1929046563192904,
1045
- "grad_norm": 0.3939523994922638,
1046
- "learning_rate": 6.915735458115884e-05,
1047
- "loss": 1.2766,
1048
  "step": 720
1049
  },
1050
  {
1051
  "epoch": 3.2150776053215075,
1052
- "grad_norm": 0.3616690933704376,
1053
- "learning_rate": 6.768462997293413e-05,
1054
- "loss": 1.2892,
1055
  "step": 725
1056
  },
1057
  {
1058
  "epoch": 3.237250554323725,
1059
- "grad_norm": 0.3850267231464386,
1060
- "learning_rate": 6.62196907326595e-05,
1061
- "loss": 1.2981,
1062
  "step": 730
1063
  },
1064
  {
1065
  "epoch": 3.259423503325942,
1066
- "grad_norm": 0.39054858684539795,
1067
- "learning_rate": 6.476288979118496e-05,
1068
- "loss": 1.3101,
1069
  "step": 735
1070
  },
1071
  {
1072
  "epoch": 3.2815964523281598,
1073
- "grad_norm": 0.38400739431381226,
1074
- "learning_rate": 6.331457811869437e-05,
1075
- "loss": 1.2923,
1076
  "step": 740
1077
  },
1078
  {
1079
  "epoch": 3.303769401330377,
1080
- "grad_norm": 0.3852229714393616,
1081
- "learning_rate": 6.187510464015022e-05,
1082
- "loss": 1.2801,
1083
  "step": 745
1084
  },
1085
  {
1086
  "epoch": 3.3259423503325944,
1087
- "grad_norm": 0.3806511461734772,
1088
- "learning_rate": 6.0444816151231375e-05,
1089
- "loss": 1.2816,
1090
  "step": 750
1091
  },
1092
  {
1093
  "epoch": 3.3481152993348116,
1094
- "grad_norm": 0.3717353045940399,
1095
- "learning_rate": 5.902405723478346e-05,
1096
- "loss": 1.2615,
1097
  "step": 755
1098
  },
1099
  {
1100
  "epoch": 3.3702882483370287,
1101
- "grad_norm": 0.3781892657279968,
1102
- "learning_rate": 5.76131701778025e-05,
1103
- "loss": 1.2936,
1104
  "step": 760
1105
  },
1106
  {
1107
  "epoch": 3.3924611973392462,
1108
- "grad_norm": 0.38951125741004944,
1109
- "learning_rate": 5.621249488897176e-05,
1110
- "loss": 1.2719,
1111
  "step": 765
1112
  },
1113
  {
1114
  "epoch": 3.4146341463414633,
1115
- "grad_norm": 0.6626437306404114,
1116
- "learning_rate": 5.4822368816771406e-05,
1117
- "loss": 1.2889,
1118
  "step": 770
1119
  },
1120
  {
1121
  "epoch": 3.436807095343681,
1122
- "grad_norm": 0.3741094172000885,
1123
- "learning_rate": 5.344312686818106e-05,
1124
- "loss": 1.2795,
1125
  "step": 775
1126
  },
1127
  {
1128
  "epoch": 3.458980044345898,
1129
- "grad_norm": 0.38097867369651794,
1130
- "learning_rate": 5.207510132799436e-05,
1131
- "loss": 1.2923,
1132
  "step": 780
1133
  },
1134
  {
1135
  "epoch": 3.481152993348115,
1136
- "grad_norm": 0.3767342269420624,
1137
- "learning_rate": 5.0718621778765476e-05,
1138
- "loss": 1.278,
1139
  "step": 785
1140
  },
1141
  {
1142
  "epoch": 3.5033259423503327,
1143
- "grad_norm": 0.37775152921676636,
1144
- "learning_rate": 4.9374015021406914e-05,
1145
- "loss": 1.2681,
1146
  "step": 790
1147
  },
1148
  {
1149
  "epoch": 3.52549889135255,
1150
- "grad_norm": 0.3940396010875702,
1151
- "learning_rate": 4.804160499645667e-05,
1152
- "loss": 1.2799,
1153
  "step": 795
1154
  },
1155
  {
1156
  "epoch": 3.5476718403547673,
1157
- "grad_norm": 0.3780502378940582,
1158
- "learning_rate": 4.6721712706035236e-05,
1159
- "loss": 1.2897,
1160
  "step": 800
1161
  },
1162
  {
1163
  "epoch": 3.5698447893569845,
1164
- "grad_norm": 0.3779968321323395,
1165
- "learning_rate": 4.5414656136510334e-05,
1166
- "loss": 1.2842,
1167
  "step": 805
1168
  },
1169
  {
1170
  "epoch": 3.5920177383592016,
1171
- "grad_norm": 0.38577476143836975,
1172
- "learning_rate": 4.412075018188805e-05,
1173
- "loss": 1.2793,
1174
  "step": 810
1175
  },
1176
  {
1177
  "epoch": 3.614190687361419,
1178
- "grad_norm": 0.3731294274330139,
1179
- "learning_rate": 4.2840306567949076e-05,
1180
- "loss": 1.2809,
1181
  "step": 815
1182
  },
1183
  {
1184
  "epoch": 3.6363636363636362,
1185
- "grad_norm": 0.37152302265167236,
1186
- "learning_rate": 4.157363377714819e-05,
1187
- "loss": 1.3013,
1188
  "step": 820
1189
  },
1190
  {
1191
  "epoch": 3.658536585365854,
1192
- "grad_norm": 0.38100916147232056,
1193
- "learning_rate": 4.0321036974295156e-05,
1194
- "loss": 1.2836,
1195
  "step": 825
1196
  },
1197
  {
1198
  "epoch": 3.680709534368071,
1199
- "grad_norm": 0.3642306625843048,
1200
- "learning_rate": 3.9082817933035134e-05,
1201
- "loss": 1.2839,
1202
  "step": 830
1203
  },
1204
  {
1205
  "epoch": 3.7028824833702885,
1206
- "grad_norm": 0.3758902847766876,
1207
- "learning_rate": 3.785927496314543e-05,
1208
- "loss": 1.2876,
1209
  "step": 835
1210
  },
1211
  {
1212
  "epoch": 3.7250554323725056,
1213
- "grad_norm": 0.3736809194087982,
1214
- "learning_rate": 3.6650702838667464e-05,
1215
- "loss": 1.2741,
1216
  "step": 840
1217
  },
1218
  {
1219
  "epoch": 3.7472283813747227,
1220
- "grad_norm": 0.36693236231803894,
1221
- "learning_rate": 3.5457392726890236e-05,
1222
- "loss": 1.2791,
1223
  "step": 845
1224
  },
1225
  {
1226
  "epoch": 3.7694013303769403,
1227
- "grad_norm": 0.38247162103652954,
1228
- "learning_rate": 3.427963211820274e-05,
1229
- "loss": 1.2813,
1230
  "step": 850
1231
  },
1232
  {
1233
  "epoch": 3.7915742793791574,
1234
- "grad_norm": 0.37350913882255554,
1235
- "learning_rate": 3.3117704756832226e-05,
1236
- "loss": 1.29,
1237
  "step": 855
1238
  },
1239
  {
1240
  "epoch": 3.8137472283813745,
1241
- "grad_norm": 0.44421860575675964,
1242
- "learning_rate": 3.197189057248491e-05,
1243
- "loss": 1.2773,
1244
  "step": 860
1245
  },
1246
  {
1247
  "epoch": 3.835920177383592,
1248
- "grad_norm": 0.3692834973335266,
1249
- "learning_rate": 3.0842465612905837e-05,
1250
- "loss": 1.2869,
1251
  "step": 865
1252
  },
1253
  {
1254
  "epoch": 3.858093126385809,
1255
- "grad_norm": 0.375489205121994,
1256
- "learning_rate": 2.9729701977374035e-05,
1257
- "loss": 1.2846,
1258
  "step": 870
1259
  },
1260
  {
1261
  "epoch": 3.8802660753880267,
1262
- "grad_norm": 0.3838193714618683,
1263
- "learning_rate": 2.863386775114848e-05,
1264
- "loss": 1.294,
1265
  "step": 875
1266
  },
1267
  {
1268
  "epoch": 3.902439024390244,
1269
- "grad_norm": 0.38194307684898376,
1270
- "learning_rate": 2.7555226940881583e-05,
1271
- "loss": 1.2838,
1272
  "step": 880
1273
  },
1274
  {
1275
  "epoch": 3.9246119733924614,
1276
- "grad_norm": 0.3884502947330475,
1277
- "learning_rate": 2.6494039411015193e-05,
1278
- "loss": 1.301,
1279
  "step": 885
1280
  },
1281
  {
1282
  "epoch": 3.9467849223946785,
1283
- "grad_norm": 0.38043659925460815,
1284
- "learning_rate": 2.545056082117433e-05,
1285
- "loss": 1.2984,
1286
  "step": 890
1287
  },
1288
  {
1289
  "epoch": 3.9689578713968956,
1290
- "grad_norm": 0.37292882800102234,
1291
- "learning_rate": 2.4425042564574184e-05,
1292
- "loss": 1.2826,
1293
  "step": 895
1294
  },
1295
  {
1296
  "epoch": 3.991130820399113,
1297
- "grad_norm": 0.37573322653770447,
1298
- "learning_rate": 2.3417731707454737e-05,
1299
- "loss": 1.2688,
1300
  "step": 900
1301
  },
1302
  {
1303
  "epoch": 4.0,
1304
- "eval_loss": 1.7925071716308594,
1305
- "eval_runtime": 0.3371,
1306
- "eval_samples_per_second": 2.967,
1307
- "eval_steps_per_second": 2.967,
1308
  "step": 902
1309
  },
1310
  {
1311
  "epoch": 4.013303769401331,
1312
- "grad_norm": 0.37189018726348877,
1313
- "learning_rate": 2.242887092955801e-05,
1314
- "loss": 1.2666,
1315
  "step": 905
1316
  },
1317
  {
1318
  "epoch": 4.035476718403547,
1319
- "grad_norm": 0.37796613574028015,
1320
- "learning_rate": 2.1458698465662187e-05,
1321
- "loss": 1.2526,
1322
  "step": 910
1323
  },
1324
  {
1325
  "epoch": 4.057649667405765,
1326
- "grad_norm": 0.3842335343360901,
1327
- "learning_rate": 2.0507448048186208e-05,
1328
- "loss": 1.2523,
1329
  "step": 915
1330
  },
1331
  {
1332
  "epoch": 4.0798226164079825,
1333
- "grad_norm": 0.38330337405204773,
1334
- "learning_rate": 1.957534885087944e-05,
1335
- "loss": 1.2627,
1336
  "step": 920
1337
  },
1338
  {
1339
  "epoch": 4.101995565410199,
1340
- "grad_norm": 0.37810632586479187,
1341
- "learning_rate": 1.866262543360958e-05,
1342
- "loss": 1.2702,
1343
  "step": 925
1344
  },
1345
  {
1346
  "epoch": 4.124168514412417,
1347
- "grad_norm": 0.39427462220191956,
1348
- "learning_rate": 1.7769497688261973e-05,
1349
- "loss": 1.2574,
1350
  "step": 930
1351
  },
1352
  {
1353
  "epoch": 4.146341463414634,
1354
- "grad_norm": 0.39581403136253357,
1355
- "learning_rate": 1.6896180785763593e-05,
1356
- "loss": 1.2577,
1357
  "step": 935
1358
  },
1359
  {
1360
  "epoch": 4.168514412416852,
1361
- "grad_norm": 0.3902391195297241,
1362
- "learning_rate": 1.604288512424439e-05,
1363
- "loss": 1.2624,
1364
  "step": 940
1365
  },
1366
  {
1367
  "epoch": 4.1906873614190685,
1368
- "grad_norm": 0.3963169753551483,
1369
- "learning_rate": 1.520981627834851e-05,
1370
- "loss": 1.2757,
1371
  "step": 945
1372
  },
1373
  {
1374
  "epoch": 4.212860310421286,
1375
- "grad_norm": 0.379292756319046,
1376
- "learning_rate": 1.4397174949707725e-05,
1377
- "loss": 1.2572,
1378
  "step": 950
1379
  },
1380
  {
1381
  "epoch": 4.235033259423504,
1382
- "grad_norm": 0.3748960494995117,
1383
- "learning_rate": 1.3605156918588469e-05,
1384
- "loss": 1.2713,
1385
  "step": 955
1386
  },
1387
  {
1388
  "epoch": 4.25720620842572,
1389
- "grad_norm": 0.3822687268257141,
1390
- "learning_rate": 1.2833952996724863e-05,
1391
- "loss": 1.2634,
1392
  "step": 960
1393
  },
1394
  {
1395
  "epoch": 4.279379157427938,
1396
- "grad_norm": 0.3805009126663208,
1397
- "learning_rate": 1.208374898134883e-05,
1398
- "loss": 1.2715,
1399
  "step": 965
1400
  },
1401
  {
1402
  "epoch": 4.301552106430155,
1403
- "grad_norm": 0.38446545600891113,
1404
- "learning_rate": 1.1354725610427807e-05,
1405
- "loss": 1.2776,
1406
  "step": 970
1407
  },
1408
  {
1409
  "epoch": 4.323725055432373,
1410
- "grad_norm": 0.37589287757873535,
1411
- "learning_rate": 1.0647058519121821e-05,
1412
- "loss": 1.2405,
1413
  "step": 975
1414
  },
1415
  {
1416
  "epoch": 4.34589800443459,
1417
- "grad_norm": 0.38052481412887573,
1418
- "learning_rate": 9.960918197469771e-06,
1419
- "loss": 1.2674,
1420
  "step": 980
1421
  },
1422
  {
1423
  "epoch": 4.368070953436807,
1424
- "grad_norm": 0.3807436227798462,
1425
- "learning_rate": 9.296469949315156e-06,
1426
- "loss": 1.2721,
1427
  "step": 985
1428
  },
1429
  {
1430
  "epoch": 4.390243902439025,
1431
- "grad_norm": 0.38531798124313354,
1432
- "learning_rate": 8.653873852481364e-06,
1433
- "loss": 1.2474,
1434
  "step": 990
1435
  },
1436
  {
1437
  "epoch": 4.412416851441241,
1438
- "grad_norm": 0.3917078673839569,
1439
- "learning_rate": 8.033284720205946e-06,
1440
- "loss": 1.2672,
1441
  "step": 995
1442
  },
1443
  {
1444
  "epoch": 4.434589800443459,
1445
- "grad_norm": 0.372130423784256,
1446
- "learning_rate": 7.434852063843278e-06,
1447
- "loss": 1.2621,
1448
  "step": 1000
1449
  },
1450
  {
1451
  "epoch": 4.4567627494456765,
1452
- "grad_norm": 0.37912076711654663,
1453
- "learning_rate": 6.858720056844614e-06,
1454
- "loss": 1.2607,
1455
  "step": 1005
1456
  },
1457
  {
1458
  "epoch": 4.478935698447893,
1459
- "grad_norm": 0.39060723781585693,
1460
- "learning_rate": 6.3050275000238414e-06,
1461
- "loss": 1.2672,
1462
  "step": 1010
1463
  },
1464
  {
1465
  "epoch": 4.501108647450111,
1466
- "grad_norm": 0.3805095851421356,
1467
- "learning_rate": 5.77390778811796e-06,
1468
- "loss": 1.2756,
1469
  "step": 1015
1470
  },
1471
  {
1472
  "epoch": 4.523281596452328,
1473
- "grad_norm": 0.39004358649253845,
1474
- "learning_rate": 5.265488877649816e-06,
1475
- "loss": 1.2687,
1476
  "step": 1020
1477
  },
1478
  {
1479
  "epoch": 4.545454545454545,
1480
- "grad_norm": 0.3834868371486664,
1481
- "learning_rate": 4.7798932561009865e-06,
1482
- "loss": 1.2681,
1483
  "step": 1025
1484
  },
1485
  {
1486
  "epoch": 4.5676274944567625,
1487
- "grad_norm": 0.3744969666004181,
1488
- "learning_rate": 4.317237912402316e-06,
1489
- "loss": 1.2697,
1490
  "step": 1030
1491
  },
1492
  {
1493
  "epoch": 4.58980044345898,
1494
- "grad_norm": 0.3825860321521759,
1495
- "learning_rate": 3.877634308749078e-06,
1496
- "loss": 1.2459,
1497
  "step": 1035
1498
  },
1499
  {
1500
  "epoch": 4.611973392461198,
1501
- "grad_norm": 0.38495102524757385,
1502
- "learning_rate": 3.461188353747702e-06,
1503
- "loss": 1.2642,
1504
  "step": 1040
1505
  },
1506
  {
1507
  "epoch": 4.634146341463414,
1508
- "grad_norm": 0.3805482089519501,
1509
- "learning_rate": 3.068000376900515e-06,
1510
- "loss": 1.2781,
1511
  "step": 1045
1512
  },
1513
  {
1514
  "epoch": 4.656319290465632,
1515
- "grad_norm": 0.3828931748867035,
1516
- "learning_rate": 2.6981651044344024e-06,
1517
- "loss": 1.2646,
1518
  "step": 1050
1519
  },
1520
  {
1521
  "epoch": 4.678492239467849,
1522
- "grad_norm": 0.38155028223991394,
1523
- "learning_rate": 2.3517716364795385e-06,
1524
- "loss": 1.2696,
1525
  "step": 1055
1526
  },
1527
  {
1528
  "epoch": 4.700665188470067,
1529
- "grad_norm": 0.3936275541782379,
1530
- "learning_rate": 2.028903425603612e-06,
1531
- "loss": 1.2635,
1532
  "step": 1060
1533
  },
1534
  {
1535
  "epoch": 4.722838137472284,
1536
- "grad_norm": 0.37505555152893066,
1537
- "learning_rate": 1.7296382567064672e-06,
1538
- "loss": 1.2617,
1539
  "step": 1065
1540
  },
1541
  {
1542
  "epoch": 4.745011086474501,
1543
- "grad_norm": 0.3843250572681427,
1544
- "learning_rate": 1.4540482282803137e-06,
1545
- "loss": 1.246,
1546
  "step": 1070
1547
  },
1548
  {
1549
  "epoch": 4.767184035476719,
1550
- "grad_norm": 0.388374388217926,
1551
- "learning_rate": 1.2021997350399106e-06,
1552
- "loss": 1.2853,
1553
  "step": 1075
1554
  },
1555
  {
1556
  "epoch": 4.789356984478935,
1557
- "grad_norm": 0.3848237991333008,
1558
- "learning_rate": 9.741534519267736e-07,
1559
- "loss": 1.2703,
1560
  "step": 1080
1561
  },
1562
  {
1563
  "epoch": 4.811529933481153,
1564
- "grad_norm": 0.3839035928249359,
1565
- "learning_rate": 7.699643194915784e-07,
1566
- "loss": 1.2602,
1567
  "step": 1085
1568
  },
1569
  {
1570
  "epoch": 4.8337028824833705,
1571
- "grad_norm": 0.3856402337551117,
1572
- "learning_rate": 5.896815306578818e-07,
1573
- "loss": 1.2569,
1574
  "step": 1090
1575
  },
1576
  {
1577
  "epoch": 4.855875831485587,
1578
- "grad_norm": 0.38281336426734924,
1579
- "learning_rate": 4.333485188706576e-07,
1580
- "loss": 1.2648,
1581
  "step": 1095
1582
  },
1583
  {
1584
  "epoch": 4.878048780487805,
1585
- "grad_norm": 0.37251806259155273,
1586
- "learning_rate": 3.0100294763238946e-07,
1587
- "loss": 1.2769,
1588
  "step": 1100
1589
  },
1590
  {
1591
  "epoch": 4.900221729490022,
1592
- "grad_norm": 0.38437706232070923,
1593
- "learning_rate": 1.9267670142926187e-07,
1594
- "loss": 1.2534,
1595
  "step": 1105
1596
  },
1597
  {
1598
  "epoch": 4.922394678492239,
1599
- "grad_norm": 0.3764530420303345,
1600
- "learning_rate": 1.0839587804954975e-07,
1601
- "loss": 1.2557,
1602
  "step": 1110
1603
  },
1604
  {
1605
  "epoch": 4.9445676274944566,
1606
- "grad_norm": 0.3766900599002838,
1607
- "learning_rate": 4.818078229622547e-08,
1608
- "loss": 1.2779,
1609
  "step": 1115
1610
  },
1611
  {
1612
  "epoch": 4.966740576496674,
1613
- "grad_norm": 0.3775485157966614,
1614
- "learning_rate": 1.2045921095127366e-08,
1615
- "loss": 1.265,
1616
  "step": 1120
1617
  },
1618
  {
1619
  "epoch": 4.988913525498892,
1620
- "grad_norm": 0.38629209995269775,
1621
  "learning_rate": 0.0,
1622
- "loss": 1.2699,
1623
  "step": 1125
1624
  },
1625
  {
1626
  "epoch": 4.988913525498892,
1627
- "eval_loss": 1.8047916889190674,
1628
- "eval_runtime": 0.3388,
1629
- "eval_samples_per_second": 2.951,
1630
- "eval_steps_per_second": 2.951,
1631
  "step": 1125
1632
  },
1633
  {
1634
  "epoch": 4.988913525498892,
1635
  "step": 1125,
1636
  "total_flos": 1.6629843858229821e+18,
1637
- "train_loss": 1.3908887059953479,
1638
- "train_runtime": 3606.4613,
1639
- "train_samples_per_second": 9.989,
1640
  "train_steps_per_second": 0.312
1641
  }
1642
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.004434589800443459,
13
+ "grad_norm": 1.911939263343811,
14
+ "learning_rate": 8.849557522123894e-06,
15
  "loss": 2.8127,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.022172949002217297,
20
+ "grad_norm": 2.071887731552124,
21
+ "learning_rate": 4.424778761061947e-05,
22
+ "loss": 2.8128,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.04434589800443459,
27
+ "grad_norm": 1.1280204057693481,
28
+ "learning_rate": 8.849557522123894e-05,
29
+ "loss": 2.5671,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.06651884700665188,
34
+ "grad_norm": 1.151498794555664,
35
+ "learning_rate": 0.00013274336283185842,
36
+ "loss": 2.3153,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.08869179600886919,
41
+ "grad_norm": 0.7351519465446472,
42
+ "learning_rate": 0.00017699115044247788,
43
+ "loss": 2.087,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.11086474501108648,
48
+ "grad_norm": 0.8526760339736938,
49
+ "learning_rate": 0.00022123893805309737,
50
+ "loss": 1.9278,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.13303769401330376,
55
+ "grad_norm": 0.4035344421863556,
56
+ "learning_rate": 0.00026548672566371683,
57
+ "loss": 1.8159,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.15521064301552107,
62
+ "grad_norm": 0.2873729169368744,
63
+ "learning_rate": 0.00030973451327433627,
64
+ "loss": 1.7511,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.17738359201773837,
69
+ "grad_norm": 0.2770038843154907,
70
+ "learning_rate": 0.00035398230088495576,
71
+ "loss": 1.7015,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.19955654101995565,
76
+ "grad_norm": 0.3265347480773926,
77
+ "learning_rate": 0.00039823008849557525,
78
+ "loss": 1.6655,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.22172949002217296,
83
+ "grad_norm": 0.5982062220573425,
84
+ "learning_rate": 0.00044247787610619474,
85
+ "loss": 1.6314,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.24390243902439024,
90
+ "grad_norm": 0.4608590602874756,
91
+ "learning_rate": 0.0004867256637168142,
92
+ "loss": 1.571,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.2660753880266075,
97
+ "grad_norm": 0.4388026297092438,
98
+ "learning_rate": 0.0005309734513274337,
99
+ "loss": 1.5656,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.28824833702882485,
104
+ "grad_norm": 0.3848969638347626,
105
+ "learning_rate": 0.0005752212389380532,
106
+ "loss": 1.5407,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.31042128603104213,
111
+ "grad_norm": 0.2577425241470337,
112
+ "learning_rate": 0.0006194690265486725,
113
+ "loss": 1.5097,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.3325942350332594,
118
+ "grad_norm": 0.38717567920684814,
119
+ "learning_rate": 0.0006637168141592921,
120
+ "loss": 1.5087,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.35476718403547675,
125
+ "grad_norm": 0.2715625464916229,
126
+ "learning_rate": 0.0007079646017699115,
127
+ "loss": 1.5023,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.376940133037694,
132
+ "grad_norm": 0.29234352707862854,
133
+ "learning_rate": 0.0007522123893805309,
134
+ "loss": 1.4785,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.3991130820399113,
139
+ "grad_norm": 0.28365185856819153,
140
+ "learning_rate": 0.0007964601769911505,
141
+ "loss": 1.4721,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.4212860310421286,
146
+ "grad_norm": 0.32057851552963257,
147
+ "learning_rate": 0.0008407079646017699,
148
+ "loss": 1.4595,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.4434589800443459,
153
+ "grad_norm": 0.30948102474212646,
154
+ "learning_rate": 0.0008849557522123895,
155
+ "loss": 1.4668,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.4656319290465632,
160
+ "grad_norm": 0.31636759638786316,
161
+ "learning_rate": 0.0009292035398230089,
162
+ "loss": 1.4544,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.4878048780487805,
167
+ "grad_norm": 0.24775166809558868,
168
+ "learning_rate": 0.0009734513274336283,
169
+ "loss": 1.4405,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.5099778270509978,
174
+ "grad_norm": 0.24200735986232758,
175
+ "learning_rate": 0.0009999903631006022,
176
+ "loss": 1.4508,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.532150776053215,
181
+ "grad_norm": 0.306389719247818,
182
+ "learning_rate": 0.0009998819522485391,
183
+ "loss": 1.4232,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.5543237250554324,
188
+ "grad_norm": 0.2549804449081421,
189
+ "learning_rate": 0.0009996531106254026,
190
+ "loss": 1.4249,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.5764966740576497,
195
+ "grad_norm": 0.24480336904525757,
196
+ "learning_rate": 0.0009993038933633555,
197
+ "loss": 1.4142,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.5986696230598669,
202
+ "grad_norm": 0.2402483969926834,
203
+ "learning_rate": 0.0009988343845952696,
204
+ "loss": 1.414,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.6208425720620843,
209
+ "grad_norm": 0.22881096601486206,
210
+ "learning_rate": 0.000998244697434456,
211
+ "loss": 1.4312,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.6430155210643016,
216
+ "grad_norm": 0.2914198637008667,
217
+ "learning_rate": 0.0009975349739474153,
218
+ "loss": 1.417,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.6651884700665188,
223
+ "grad_norm": 0.2460290491580963,
224
+ "learning_rate": 0.0009967053851196099,
225
+ "loss": 1.4257,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.6873614190687362,
230
+ "grad_norm": 0.22039610147476196,
231
+ "learning_rate": 0.0009957561308142709,
232
+ "loss": 1.4142,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.7095343680709535,
237
+ "grad_norm": 0.24831444025039673,
238
+ "learning_rate": 0.0009946874397242474,
239
+ "loss": 1.403,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.7317073170731707,
244
+ "grad_norm": 0.2765662372112274,
245
+ "learning_rate": 0.0009934995693169104,
246
+ "loss": 1.3855,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.753880266075388,
251
+ "grad_norm": 0.26668626070022583,
252
+ "learning_rate": 0.0009921928057721242,
253
+ "loss": 1.3983,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.7760532150776053,
258
+ "grad_norm": 0.305702805519104,
259
+ "learning_rate": 0.0009907674639132995,
260
+ "loss": 1.4083,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.7982261640798226,
265
+ "grad_norm": 0.3682064414024353,
266
+ "learning_rate": 0.0009892238871315475,
267
+ "loss": 1.4011,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.8203991130820399,
272
+ "grad_norm": 0.2574247419834137,
273
+ "learning_rate": 0.0009875624473029507,
274
+ "loss": 1.3962,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.8425720620842572,
279
+ "grad_norm": 0.42513400316238403,
280
+ "learning_rate": 0.0009857835446989707,
281
+ "loss": 1.3924,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.8647450110864745,
286
+ "grad_norm": 0.3164115250110626,
287
+ "learning_rate": 0.0009838876078900156,
288
+ "loss": 1.3804,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.8869179600886918,
293
+ "grad_norm": 0.2795542776584625,
294
+ "learning_rate": 0.0009818750936421894,
295
+ "loss": 1.3943,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.9090909090909091,
300
+ "grad_norm": 0.2589350938796997,
301
+ "learning_rate": 0.0009797464868072487,
302
+ "loss": 1.384,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.9312638580931264,
307
+ "grad_norm": 0.9124072790145874,
308
+ "learning_rate": 0.000977502300205793,
309
+ "loss": 1.3689,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.9534368070953437,
314
+ "grad_norm": 0.3424608111381531,
315
+ "learning_rate": 0.0009751430745037169,
316
+ "loss": 1.3664,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.975609756097561,
321
+ "grad_norm": 0.25026756525039673,
322
+ "learning_rate": 0.0009726693780819534,
323
+ "loss": 1.3913,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.9977827050997783,
328
+ "grad_norm": 0.4633461833000183,
329
+ "learning_rate": 0.0009700818068995407,
330
+ "loss": 1.3936,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.9977827050997783,
335
+ "eval_loss": 1.791922926902771,
336
+ "eval_runtime": 0.3732,
337
+ "eval_samples_per_second": 2.679,
338
+ "eval_steps_per_second": 2.679,
339
  "step": 225
340
  },
341
  {
342
  "epoch": 1.0199556541019956,
343
+ "grad_norm": 0.2957637310028076,
344
+ "learning_rate": 0.0009673809843500446,
345
+ "loss": 1.3285,
346
  "step": 230
347
  },
348
  {
349
  "epoch": 1.042128603104213,
350
+ "grad_norm": 0.29558026790618896,
351
+ "learning_rate": 0.0009645675611113716,
352
+ "loss": 1.333,
353
  "step": 235
354
  },
355
  {
356
  "epoch": 1.06430155210643,
357
+ "grad_norm": 0.30888795852661133,
358
+ "learning_rate": 0.0009616422149890085,
359
+ "loss": 1.3411,
360
  "step": 240
361
  },
362
  {
363
  "epoch": 1.0864745011086474,
364
+ "grad_norm": 2.1240732669830322,
365
+ "learning_rate": 0.0009586056507527265,
366
+ "loss": 1.3353,
367
  "step": 245
368
  },
369
  {
370
  "epoch": 1.1086474501108647,
371
+ "grad_norm": 0.4370739758014679,
372
+ "learning_rate": 0.0009554585999667896,
373
+ "loss": 1.3688,
374
  "step": 250
375
  },
376
  {
377
  "epoch": 1.130820399113082,
378
+ "grad_norm": 0.5616880059242249,
379
+ "learning_rate": 0.0009522018208137067,
380
+ "loss": 1.3541,
381
  "step": 255
382
  },
383
  {
384
  "epoch": 1.1529933481152994,
385
+ "grad_norm": 0.31425368785858154,
386
+ "learning_rate": 0.0009488360979115719,
387
+ "loss": 1.3366,
388
  "step": 260
389
  },
390
  {
391
  "epoch": 1.1751662971175167,
392
+ "grad_norm": 0.2877110242843628,
393
+ "learning_rate": 0.0009453622421250352,
394
+ "loss": 1.3446,
395
  "step": 265
396
  },
397
  {
398
  "epoch": 1.1973392461197339,
399
+ "grad_norm": 0.32708439230918884,
400
+ "learning_rate": 0.0009417810903699507,
401
+ "loss": 1.3453,
402
  "step": 270
403
  },
404
  {
405
  "epoch": 1.2195121951219512,
406
+ "grad_norm": 0.24861423671245575,
407
+ "learning_rate": 0.000938093505411748,
408
+ "loss": 1.3226,
409
  "step": 275
410
  },
411
  {
412
  "epoch": 1.2416851441241685,
413
+ "grad_norm": 0.2731972634792328,
414
+ "learning_rate": 0.0009343003756575757,
415
+ "loss": 1.3322,
416
  "step": 280
417
  },
418
  {
419
  "epoch": 1.2638580931263859,
420
+ "grad_norm": 0.3049946129322052,
421
+ "learning_rate": 0.000930402614942268,
422
+ "loss": 1.3487,
423
  "step": 285
424
  },
425
  {
426
  "epoch": 1.2860310421286032,
427
+ "grad_norm": 0.27978989481925964,
428
+ "learning_rate": 0.0009264011623081859,
429
+ "loss": 1.3274,
430
  "step": 290
431
  },
432
  {
433
  "epoch": 1.3082039911308203,
434
+ "grad_norm": 0.9313792586326599,
435
+ "learning_rate": 0.0009222969817789828,
436
+ "loss": 1.3204,
437
  "step": 295
438
  },
439
  {
440
  "epoch": 1.3303769401330376,
441
+ "grad_norm": 0.2632613778114319,
442
+ "learning_rate": 0.0009180910621273555,
443
+ "loss": 1.3382,
444
  "step": 300
445
  },
446
  {
447
  "epoch": 1.352549889135255,
448
+ "grad_norm": 0.35932329297065735,
449
+ "learning_rate": 0.0009137844166368287,
450
+ "loss": 1.3397,
451
  "step": 305
452
  },
453
  {
454
  "epoch": 1.3747228381374723,
455
+ "grad_norm": 0.31215277314186096,
456
+ "learning_rate": 0.0009093780828576379,
457
+ "loss": 1.3241,
458
  "step": 310
459
  },
460
  {
461
  "epoch": 1.3968957871396896,
462
+ "grad_norm": 1.306994080543518,
463
+ "learning_rate": 0.0009048731223567636,
464
+ "loss": 1.3411,
465
  "step": 315
466
  },
467
  {
468
  "epoch": 1.4190687361419068,
469
+ "grad_norm": 0.7964722514152527,
470
+ "learning_rate": 0.0009002706204621802,
471
+ "loss": 1.3433,
472
  "step": 320
473
  },
474
  {
475
  "epoch": 1.441241685144124,
476
+ "grad_norm": 0.32706671953201294,
477
+ "learning_rate": 0.0008955716860013812,
478
+ "loss": 1.3231,
479
  "step": 325
480
  },
481
  {
482
  "epoch": 1.4634146341463414,
483
+ "grad_norm": 0.2626619338989258,
484
+ "learning_rate": 0.0008907774510342412,
485
+ "loss": 1.333,
486
  "step": 330
487
  },
488
  {
489
  "epoch": 1.4855875831485588,
490
+ "grad_norm": 0.2566235363483429,
491
+ "learning_rate": 0.0008858890705802829,
492
+ "loss": 1.3289,
493
  "step": 335
494
  },
495
  {
496
  "epoch": 1.507760532150776,
497
+ "grad_norm": 0.22669324278831482,
498
+ "learning_rate": 0.0008809077223404109,
499
+ "loss": 1.3345,
500
  "step": 340
501
  },
502
  {
503
  "epoch": 1.5299334811529932,
504
+ "grad_norm": 0.24277335405349731,
505
+ "learning_rate": 0.0008758346064131824,
506
+ "loss": 1.3258,
507
  "step": 345
508
  },
509
  {
510
  "epoch": 1.5521064301552108,
511
+ "grad_norm": 0.23311229050159454,
512
+ "learning_rate": 0.0008706709450056802,
513
+ "loss": 1.3208,
514
  "step": 350
515
  },
516
  {
517
  "epoch": 1.5742793791574279,
518
+ "grad_norm": 0.3494793176651001,
519
+ "learning_rate": 0.0008654179821390621,
520
+ "loss": 1.3253,
521
  "step": 355
522
  },
523
  {
524
  "epoch": 1.5964523281596452,
525
+ "grad_norm": 0.22604632377624512,
526
+ "learning_rate": 0.0008600769833488522,
527
+ "loss": 1.3244,
528
  "step": 360
529
  },
530
  {
531
  "epoch": 1.6186252771618626,
532
+ "grad_norm": 0.26579421758651733,
533
+ "learning_rate": 0.0008546492353800504,
534
+ "loss": 1.3256,
535
  "step": 365
536
  },
537
  {
538
  "epoch": 1.6407982261640797,
539
+ "grad_norm": 0.2301110476255417,
540
+ "learning_rate": 0.000849136045877132,
541
+ "loss": 1.3383,
542
  "step": 370
543
  },
544
  {
545
  "epoch": 1.6629711751662972,
546
+ "grad_norm": 0.22868593037128448,
547
+ "learning_rate": 0.0008435387430690114,
548
+ "loss": 1.3194,
549
  "step": 375
550
  },
551
  {
552
  "epoch": 1.6851441241685143,
553
+ "grad_norm": 0.28906944394111633,
554
+ "learning_rate": 0.0008378586754490483,
555
+ "loss": 1.3196,
556
  "step": 380
557
  },
558
  {
559
  "epoch": 1.7073170731707317,
560
+ "grad_norm": 0.22416484355926514,
561
+ "learning_rate": 0.0008320972114501697,
562
+ "loss": 1.3281,
563
  "step": 385
564
  },
565
  {
566
  "epoch": 1.729490022172949,
567
+ "grad_norm": 0.2212500125169754,
568
+ "learning_rate": 0.0008262557391151904,
569
+ "loss": 1.3166,
570
  "step": 390
571
  },
572
  {
573
  "epoch": 1.7516629711751663,
574
+ "grad_norm": 0.24597449600696564,
575
+ "learning_rate": 0.0008203356657624068,
576
+ "loss": 1.3147,
577
  "step": 395
578
  },
579
  {
580
  "epoch": 1.7738359201773837,
581
+ "grad_norm": 0.20681482553482056,
582
+ "learning_rate": 0.0008143384176465486,
583
+ "loss": 1.3207,
584
  "step": 400
585
  },
586
  {
587
  "epoch": 1.7960088691796008,
588
+ "grad_norm": 0.22514577209949493,
589
+ "learning_rate": 0.0008082654396151675,
590
+ "loss": 1.3256,
591
  "step": 405
592
  },
593
  {
594
  "epoch": 1.8181818181818183,
595
+ "grad_norm": 0.2058994621038437,
596
+ "learning_rate": 0.0008021181947605473,
597
+ "loss": 1.3051,
598
  "step": 410
599
  },
600
  {
601
  "epoch": 1.8403547671840355,
602
+ "grad_norm": 0.24868044257164001,
603
+ "learning_rate": 0.0007958981640672172,
604
+ "loss": 1.3133,
605
  "step": 415
606
  },
607
  {
608
  "epoch": 1.8625277161862528,
609
+ "grad_norm": 0.219014972448349,
610
+ "learning_rate": 0.0007896068460551562,
611
+ "loss": 1.3016,
612
  "step": 420
613
  },
614
  {
615
  "epoch": 1.8847006651884701,
616
+ "grad_norm": 0.2263939380645752,
617
+ "learning_rate": 0.0007832457564187715,
618
+ "loss": 1.3269,
619
  "step": 425
620
  },
621
  {
622
  "epoch": 1.9068736141906872,
623
+ "grad_norm": 0.22888123989105225,
624
+ "learning_rate": 0.0007768164276617396,
625
+ "loss": 1.3297,
626
  "step": 430
627
  },
628
  {
629
  "epoch": 1.9290465631929048,
630
+ "grad_norm": 0.21448783576488495,
631
+ "learning_rate": 0.0007703204087277988,
632
+ "loss": 1.3042,
633
  "step": 435
634
  },
635
  {
636
  "epoch": 1.951219512195122,
637
+ "grad_norm": 0.217611163854599,
638
+ "learning_rate": 0.0007637592646275793,
639
+ "loss": 1.3171,
640
  "step": 440
641
  },
642
  {
643
  "epoch": 1.9733924611973392,
644
+ "grad_norm": 0.2066190540790558,
645
+ "learning_rate": 0.0007571345760615634,
646
+ "loss": 1.3131,
647
  "step": 445
648
  },
649
  {
650
  "epoch": 1.9955654101995566,
651
+ "grad_norm": 0.2094324827194214,
652
+ "learning_rate": 0.0007504479390392661,
653
+ "loss": 1.31,
654
  "step": 450
655
  },
656
  {
657
  "epoch": 2.0,
658
+ "eval_loss": 1.7869027853012085,
659
+ "eval_runtime": 0.3369,
660
+ "eval_samples_per_second": 2.969,
661
+ "eval_steps_per_second": 2.969,
662
  "step": 451
663
  },
664
  {
665
  "epoch": 2.0177383592017737,
666
+ "grad_norm": 0.2380608320236206,
667
+ "learning_rate": 0.0007437009644947268,
668
+ "loss": 1.2636,
669
  "step": 455
670
  },
671
  {
672
  "epoch": 2.0399113082039912,
673
+ "grad_norm": 0.226850226521492,
674
+ "learning_rate": 0.0007368952778984051,
675
+ "loss": 1.2298,
676
  "step": 460
677
  },
678
  {
679
  "epoch": 2.0620842572062084,
680
+ "grad_norm": 0.23007981479167938,
681
+ "learning_rate": 0.0007300325188655761,
682
+ "loss": 1.2459,
683
  "step": 465
684
  },
685
  {
686
  "epoch": 2.084257206208426,
687
+ "grad_norm": 0.21383607387542725,
688
+ "learning_rate": 0.0007231143407613156,
689
+ "loss": 1.2328,
690
  "step": 470
691
  },
692
  {
693
  "epoch": 2.106430155210643,
694
+ "grad_norm": 0.22145338356494904,
695
+ "learning_rate": 0.0007161424103021752,
696
+ "loss": 1.2326,
697
  "step": 475
698
  },
699
  {
700
  "epoch": 2.12860310421286,
701
+ "grad_norm": 0.26510247588157654,
702
+ "learning_rate": 0.0007091184071546384,
703
+ "loss": 1.2377,
704
  "step": 480
705
  },
706
  {
707
  "epoch": 2.1507760532150777,
708
+ "grad_norm": 0.23144353926181793,
709
+ "learning_rate": 0.0007020440235304592,
710
+ "loss": 1.2195,
711
  "step": 485
712
  },
713
  {
714
  "epoch": 2.172949002217295,
715
+ "grad_norm": 0.22692181169986725,
716
+ "learning_rate": 0.000694920963778976,
717
+ "loss": 1.2181,
718
  "step": 490
719
  },
720
  {
721
  "epoch": 2.1951219512195124,
722
+ "grad_norm": 0.2270960956811905,
723
+ "learning_rate": 0.0006877509439765037,
724
+ "loss": 1.2444,
725
  "step": 495
726
  },
727
  {
728
  "epoch": 2.2172949002217295,
729
+ "grad_norm": 0.22735467553138733,
730
+ "learning_rate": 0.0006805356915128977,
731
+ "loss": 1.2385,
732
  "step": 500
733
  },
734
  {
735
  "epoch": 2.2394678492239466,
736
+ "grad_norm": 0.21610836684703827,
737
+ "learning_rate": 0.0006732769446753953,
738
+ "loss": 1.2476,
739
  "step": 505
740
  },
741
  {
742
  "epoch": 2.261640798226164,
743
+ "grad_norm": 0.4567781686782837,
744
+ "learning_rate": 0.0006659764522298296,
745
+ "loss": 1.2411,
746
  "step": 510
747
  },
748
  {
749
  "epoch": 2.2838137472283813,
750
+ "grad_norm": 0.23705683648586273,
751
+ "learning_rate": 0.0006586359729993199,
752
+ "loss": 1.2334,
753
  "step": 515
754
  },
755
  {
756
  "epoch": 2.305986696230599,
757
+ "grad_norm": 0.22446921467781067,
758
+ "learning_rate": 0.0006512572754405379,
759
+ "loss": 1.2342,
760
  "step": 520
761
  },
762
  {
763
  "epoch": 2.328159645232816,
764
+ "grad_norm": 0.2318691909313202,
765
+ "learning_rate": 0.0006438421372176556,
766
+ "loss": 1.2379,
767
  "step": 525
768
  },
769
  {
770
  "epoch": 2.3503325942350335,
771
+ "grad_norm": 0.2584494650363922,
772
+ "learning_rate": 0.0006363923447740718,
773
+ "loss": 1.2371,
774
  "step": 530
775
  },
776
  {
777
  "epoch": 2.3725055432372506,
778
+ "grad_norm": 0.2136611044406891,
779
+ "learning_rate": 0.0006289096929020253,
780
+ "loss": 1.2497,
781
  "step": 535
782
  },
783
  {
784
  "epoch": 2.3946784922394677,
785
+ "grad_norm": 0.22401590645313263,
786
+ "learning_rate": 0.000621395984310197,
787
+ "loss": 1.2508,
788
  "step": 540
789
  },
790
  {
791
  "epoch": 2.4168514412416853,
792
+ "grad_norm": 0.21865229308605194,
793
+ "learning_rate": 0.0006138530291894032,
794
+ "loss": 1.2494,
795
  "step": 545
796
  },
797
  {
798
  "epoch": 2.4390243902439024,
799
+ "grad_norm": 0.23078739643096924,
800
+ "learning_rate": 0.0006062826447764884,
801
+ "loss": 1.2474,
802
  "step": 550
803
  },
804
  {
805
  "epoch": 2.4611973392461195,
806
+ "grad_norm": 0.24809886515140533,
807
+ "learning_rate": 0.0005986866549165184,
808
+ "loss": 1.2472,
809
  "step": 555
810
  },
811
  {
812
  "epoch": 2.483370288248337,
813
+ "grad_norm": 0.23055674135684967,
814
+ "learning_rate": 0.000591066889623383,
815
+ "loss": 1.23,
816
  "step": 560
817
  },
818
  {
819
  "epoch": 2.505543237250554,
820
+ "grad_norm": 0.22716328501701355,
821
+ "learning_rate": 0.000583425184638912,
822
+ "loss": 1.2495,
823
  "step": 565
824
  },
825
  {
826
  "epoch": 2.5277161862527717,
827
+ "grad_norm": 0.21761415898799896,
828
+ "learning_rate": 0.0005757633809906107,
829
+ "loss": 1.2448,
830
  "step": 570
831
  },
832
  {
833
  "epoch": 2.549889135254989,
834
+ "grad_norm": 0.2366757094860077,
835
+ "learning_rate": 0.0005680833245481234,
836
+ "loss": 1.2481,
837
  "step": 575
838
  },
839
  {
840
  "epoch": 2.5720620842572064,
841
+ "grad_norm": 0.23113702237606049,
842
+ "learning_rate": 0.0005603868655785279,
843
+ "loss": 1.2422,
844
  "step": 580
845
  },
846
  {
847
  "epoch": 2.5942350332594235,
848
+ "grad_norm": 0.2284475862979889,
849
+ "learning_rate": 0.0005526758583005735,
850
+ "loss": 1.2354,
851
  "step": 585
852
  },
853
  {
854
  "epoch": 2.6164079822616406,
855
+ "grad_norm": 0.23871304094791412,
856
+ "learning_rate": 0.0005449521604379652,
857
+ "loss": 1.2573,
858
  "step": 590
859
  },
860
  {
861
  "epoch": 2.638580931263858,
862
+ "grad_norm": 0.2440497726202011,
863
+ "learning_rate": 0.0005372176327718029,
864
+ "loss": 1.2634,
865
  "step": 595
866
  },
867
  {
868
  "epoch": 2.6607538802660753,
869
+ "grad_norm": 0.23553554713726044,
870
+ "learning_rate": 0.0005294741386922863,
871
+ "loss": 1.2494,
872
  "step": 600
873
  },
874
  {
875
  "epoch": 2.682926829268293,
876
+ "grad_norm": 0.22893203794956207,
877
+ "learning_rate": 0.000521723543749789,
878
+ "loss": 1.2317,
879
  "step": 605
880
  },
881
  {
882
  "epoch": 2.70509977827051,
883
+ "grad_norm": 0.21567869186401367,
884
+ "learning_rate": 0.0005139677152054135,
885
+ "loss": 1.2267,
886
  "step": 610
887
  },
888
  {
889
  "epoch": 2.7272727272727275,
890
+ "grad_norm": 0.21539655327796936,
891
+ "learning_rate": 0.000506208521581133,
892
+ "loss": 1.2314,
893
  "step": 615
894
  },
895
  {
896
  "epoch": 2.7494456762749446,
897
+ "grad_norm": 0.263122022151947,
898
+ "learning_rate": 0.0004984478322096308,
899
+ "loss": 1.2504,
900
  "step": 620
901
  },
902
  {
903
  "epoch": 2.7716186252771617,
904
+ "grad_norm": 0.21988996863365173,
905
+ "learning_rate": 0.0004906875167839433,
906
+ "loss": 1.2424,
907
  "step": 625
908
  },
909
  {
910
  "epoch": 2.7937915742793793,
911
+ "grad_norm": 0.22908321022987366,
912
+ "learning_rate": 0.00048292944490701606,
913
+ "loss": 1.2374,
914
  "step": 630
915
  },
916
  {
917
  "epoch": 2.8159645232815964,
918
+ "grad_norm": 0.21693512797355652,
919
+ "learning_rate": 0.00047517548564128293,
920
+ "loss": 1.2368,
921
  "step": 635
922
  },
923
  {
924
  "epoch": 2.8381374722838135,
925
+ "grad_norm": 0.22369663417339325,
926
+ "learning_rate": 0.00046742750705837356,
927
+ "loss": 1.2227,
928
  "step": 640
929
  },
930
  {
931
  "epoch": 2.860310421286031,
932
+ "grad_norm": 0.21551957726478577,
933
+ "learning_rate": 0.0004596873757890612,
934
+ "loss": 1.2353,
935
  "step": 645
936
  },
937
  {
938
  "epoch": 2.882483370288248,
939
+ "grad_norm": 0.21659965813159943,
940
+ "learning_rate": 0.00045195695657355636,
941
+ "loss": 1.242,
942
  "step": 650
943
  },
944
  {
945
  "epoch": 2.9046563192904657,
946
+ "grad_norm": 0.2131386399269104,
947
+ "learning_rate": 0.00044423811181225727,
948
+ "loss": 1.2372,
949
  "step": 655
950
  },
951
  {
952
  "epoch": 2.926829268292683,
953
+ "grad_norm": 0.21243086457252502,
954
+ "learning_rate": 0.0004365327011170628,
955
+ "loss": 1.2492,
956
  "step": 660
957
  },
958
  {
959
  "epoch": 2.9490022172949004,
960
+ "grad_norm": 0.2212466597557068,
961
+ "learning_rate": 0.0004288425808633575,
962
+ "loss": 1.2548,
963
  "step": 665
964
  },
965
  {
966
  "epoch": 2.9711751662971175,
967
+ "grad_norm": 0.21596314013004303,
968
+ "learning_rate": 0.0004211696037427772,
969
+ "loss": 1.231,
970
  "step": 670
971
  },
972
  {
973
  "epoch": 2.9933481152993346,
974
+ "grad_norm": 0.20963598787784576,
975
+ "learning_rate": 0.0004135156183168613,
976
+ "loss": 1.2416,
977
  "step": 675
978
  },
979
  {
980
  "epoch": 2.9977827050997785,
981
+ "eval_loss": 1.824194312095642,
982
+ "eval_runtime": 0.3515,
983
+ "eval_samples_per_second": 2.845,
984
+ "eval_steps_per_second": 2.845,
985
  "step": 676
986
  },
987
  {
988
  "epoch": 3.015521064301552,
989
+ "grad_norm": 0.23751798272132874,
990
+ "learning_rate": 0.0004058824685716997,
991
+ "loss": 1.1799,
992
  "step": 680
993
  },
994
  {
995
  "epoch": 3.0376940133037693,
996
+ "grad_norm": 0.22644291818141937,
997
+ "learning_rate": 0.0003982719934736832,
998
+ "loss": 1.1569,
999
  "step": 685
1000
  },
1001
  {
1002
  "epoch": 3.059866962305987,
1003
+ "grad_norm": 0.24408124387264252,
1004
+ "learning_rate": 0.0003906860265264622,
1005
+ "loss": 1.1543,
1006
  "step": 690
1007
  },
1008
  {
1009
  "epoch": 3.082039911308204,
1010
+ "grad_norm": 0.23074336349964142,
1011
+ "learning_rate": 0.00038312639532922245,
1012
+ "loss": 1.1437,
1013
  "step": 695
1014
  },
1015
  {
1016
  "epoch": 3.104212860310421,
1017
+ "grad_norm": 0.2394222617149353,
1018
+ "learning_rate": 0.00037559492113638205,
1019
+ "loss": 1.1394,
1020
  "step": 700
1021
  },
1022
  {
1023
  "epoch": 3.1263858093126387,
1024
+ "grad_norm": 0.23385775089263916,
1025
+ "learning_rate": 0.00036809341841881815,
1026
+ "loss": 1.1507,
1027
  "step": 705
1028
  },
1029
  {
1030
  "epoch": 3.1485587583148558,
1031
+ "grad_norm": 0.23921357095241547,
1032
+ "learning_rate": 0.00036062369442672724,
1033
+ "loss": 1.1554,
1034
  "step": 710
1035
  },
1036
  {
1037
  "epoch": 3.1707317073170733,
1038
+ "grad_norm": 0.2213825136423111,
1039
+ "learning_rate": 0.00035318754875422585,
1040
+ "loss": 1.1523,
1041
  "step": 715
1042
  },
1043
  {
1044
  "epoch": 3.1929046563192904,
1045
+ "grad_norm": 0.24610434472560883,
1046
+ "learning_rate": 0.0003457867729057942,
1047
+ "loss": 1.1484,
1048
  "step": 720
1049
  },
1050
  {
1051
  "epoch": 3.2150776053215075,
1052
+ "grad_norm": 0.23285223543643951,
1053
+ "learning_rate": 0.0003384231498646706,
1054
+ "loss": 1.1599,
1055
  "step": 725
1056
  },
1057
  {
1058
  "epoch": 3.237250554323725,
1059
+ "grad_norm": 0.23643791675567627,
1060
+ "learning_rate": 0.0003310984536632975,
1061
+ "loss": 1.1655,
1062
  "step": 730
1063
  },
1064
  {
1065
  "epoch": 3.259423503325942,
1066
+ "grad_norm": 0.23265090584754944,
1067
+ "learning_rate": 0.0003238144489559248,
1068
+ "loss": 1.1772,
1069
  "step": 735
1070
  },
1071
  {
1072
  "epoch": 3.2815964523281598,
1073
+ "grad_norm": 0.23318685591220856,
1074
+ "learning_rate": 0.00031657289059347184,
1075
+ "loss": 1.1607,
1076
  "step": 740
1077
  },
1078
  {
1079
  "epoch": 3.303769401330377,
1080
+ "grad_norm": 0.23179373145103455,
1081
+ "learning_rate": 0.00030937552320075114,
1082
+ "loss": 1.1541,
1083
  "step": 745
1084
  },
1085
  {
1086
  "epoch": 3.3259423503325944,
1087
+ "grad_norm": 0.22854293882846832,
1088
+ "learning_rate": 0.0003022240807561569,
1089
+ "loss": 1.1573,
1090
  "step": 750
1091
  },
1092
  {
1093
  "epoch": 3.3481152993348116,
1094
+ "grad_norm": 0.24048267304897308,
1095
+ "learning_rate": 0.0002951202861739173,
1096
+ "loss": 1.134,
1097
  "step": 755
1098
  },
1099
  {
1100
  "epoch": 3.3702882483370287,
1101
+ "grad_norm": 0.24298301339149475,
1102
+ "learning_rate": 0.0002880658508890125,
1103
+ "loss": 1.1615,
1104
  "step": 760
1105
  },
1106
  {
1107
  "epoch": 3.3924611973392462,
1108
+ "grad_norm": 0.24603115022182465,
1109
+ "learning_rate": 0.0002810624744448588,
1110
+ "loss": 1.1458,
1111
  "step": 765
1112
  },
1113
  {
1114
  "epoch": 3.4146341463414633,
1115
+ "grad_norm": 0.2374211847782135,
1116
+ "learning_rate": 0.000274111844083857,
1117
+ "loss": 1.1604,
1118
  "step": 770
1119
  },
1120
  {
1121
  "epoch": 3.436807095343681,
1122
+ "grad_norm": 0.23238298296928406,
1123
+ "learning_rate": 0.0002672156343409053,
1124
+ "loss": 1.1525,
1125
  "step": 775
1126
  },
1127
  {
1128
  "epoch": 3.458980044345898,
1129
+ "grad_norm": 0.24065588414669037,
1130
+ "learning_rate": 0.00026037550663997176,
1131
+ "loss": 1.1626,
1132
  "step": 780
1133
  },
1134
  {
1135
  "epoch": 3.481152993348115,
1136
+ "grad_norm": 0.23232793807983398,
1137
+ "learning_rate": 0.00025359310889382737,
1138
+ "loss": 1.1567,
1139
  "step": 785
1140
  },
1141
  {
1142
  "epoch": 3.5033259423503327,
1143
+ "grad_norm": 0.31952565908432007,
1144
+ "learning_rate": 0.0002468700751070346,
1145
+ "loss": 1.1441,
1146
  "step": 790
1147
  },
1148
  {
1149
  "epoch": 3.52549889135255,
1150
+ "grad_norm": 0.25355201959609985,
1151
+ "learning_rate": 0.00024020802498228333,
1152
+ "loss": 1.1578,
1153
  "step": 795
1154
  },
1155
  {
1156
  "epoch": 3.5476718403547673,
1157
+ "grad_norm": 0.23360273241996765,
1158
+ "learning_rate": 0.00023360856353017617,
1159
+ "loss": 1.1624,
1160
  "step": 800
1161
  },
1162
  {
1163
  "epoch": 3.5698447893569845,
1164
+ "grad_norm": 0.24496670067310333,
1165
+ "learning_rate": 0.00022707328068255166,
1166
+ "loss": 1.1608,
1167
  "step": 805
1168
  },
1169
  {
1170
  "epoch": 3.5920177383592016,
1171
+ "grad_norm": 0.23215603828430176,
1172
+ "learning_rate": 0.00022060375090944025,
1173
+ "loss": 1.1561,
1174
  "step": 810
1175
  },
1176
  {
1177
  "epoch": 3.614190687361419,
1178
+ "grad_norm": 0.24374577403068542,
1179
+ "learning_rate": 0.00021420153283974535,
1180
+ "loss": 1.1582,
1181
  "step": 815
1182
  },
1183
  {
1184
  "epoch": 3.6363636363636362,
1185
+ "grad_norm": 0.23796550929546356,
1186
+ "learning_rate": 0.00020786816888574095,
1187
+ "loss": 1.1714,
1188
  "step": 820
1189
  },
1190
  {
1191
  "epoch": 3.658536585365854,
1192
+ "grad_norm": 0.33616843819618225,
1193
+ "learning_rate": 0.00020160518487147579,
1194
+ "loss": 1.155,
1195
  "step": 825
1196
  },
1197
  {
1198
  "epoch": 3.680709534368071,
1199
+ "grad_norm": 0.23069876432418823,
1200
+ "learning_rate": 0.00019541408966517566,
1201
+ "loss": 1.1552,
1202
  "step": 830
1203
  },
1204
  {
1205
  "epoch": 3.7028824833702885,
1206
+ "grad_norm": 0.23669025301933289,
1207
+ "learning_rate": 0.00018929637481572713,
1208
+ "loss": 1.1598,
1209
  "step": 835
1210
  },
1211
  {
1212
  "epoch": 3.7250554323725056,
1213
+ "grad_norm": 0.23340646922588348,
1214
+ "learning_rate": 0.0001832535141933373,
1215
+ "loss": 1.1478,
1216
  "step": 840
1217
  },
1218
  {
1219
  "epoch": 3.7472283813747227,
1220
+ "grad_norm": 0.24212823808193207,
1221
+ "learning_rate": 0.00017728696363445117,
1222
+ "loss": 1.1521,
1223
  "step": 845
1224
  },
1225
  {
1226
  "epoch": 3.7694013303769403,
1227
+ "grad_norm": 0.243617445230484,
1228
+ "learning_rate": 0.0001713981605910137,
1229
+ "loss": 1.1534,
1230
  "step": 850
1231
  },
1232
  {
1233
  "epoch": 3.7915742793791574,
1234
+ "grad_norm": 0.23238743841648102,
1235
+ "learning_rate": 0.0001655885237841611,
1236
+ "loss": 1.1595,
1237
  "step": 855
1238
  },
1239
  {
1240
  "epoch": 3.8137472283813745,
1241
+ "grad_norm": 0.2364315241575241,
1242
+ "learning_rate": 0.00015985945286242452,
1243
+ "loss": 1.1499,
1244
  "step": 860
1245
  },
1246
  {
1247
  "epoch": 3.835920177383592,
1248
+ "grad_norm": 0.23698101937770844,
1249
+ "learning_rate": 0.00015421232806452916,
1250
+ "loss": 1.1564,
1251
  "step": 865
1252
  },
1253
  {
1254
  "epoch": 3.858093126385809,
1255
+ "grad_norm": 0.2341204732656479,
1256
+ "learning_rate": 0.00014864850988687017,
1257
+ "loss": 1.1575,
1258
  "step": 870
1259
  },
1260
  {
1261
  "epoch": 3.8802660753880267,
1262
+ "grad_norm": 0.2593097984790802,
1263
+ "learning_rate": 0.0001431693387557424,
1264
+ "loss": 1.1644,
1265
  "step": 875
1266
  },
1267
  {
1268
  "epoch": 3.902439024390244,
1269
+ "grad_norm": 0.24039191007614136,
1270
+ "learning_rate": 0.0001377761347044079,
1271
+ "loss": 1.1549,
1272
  "step": 880
1273
  },
1274
  {
1275
  "epoch": 3.9246119733924614,
1276
+ "grad_norm": 0.23400068283081055,
1277
+ "learning_rate": 0.00013247019705507596,
1278
+ "loss": 1.1712,
1279
  "step": 885
1280
  },
1281
  {
1282
  "epoch": 3.9467849223946785,
1283
+ "grad_norm": 0.23080703616142273,
1284
+ "learning_rate": 0.00012725280410587166,
1285
+ "loss": 1.1687,
1286
  "step": 890
1287
  },
1288
  {
1289
  "epoch": 3.9689578713968956,
1290
+ "grad_norm": 0.23793019354343414,
1291
+ "learning_rate": 0.00012212521282287093,
1292
+ "loss": 1.1562,
1293
  "step": 895
1294
  },
1295
  {
1296
  "epoch": 3.991130820399113,
1297
+ "grad_norm": 0.254978746175766,
1298
+ "learning_rate": 0.00011708865853727369,
1299
+ "loss": 1.1447,
1300
  "step": 900
1301
  },
1302
  {
1303
  "epoch": 4.0,
1304
+ "eval_loss": 1.9029209613800049,
1305
+ "eval_runtime": 0.3379,
1306
+ "eval_samples_per_second": 2.959,
1307
+ "eval_steps_per_second": 2.959,
1308
  "step": 902
1309
  },
1310
  {
1311
  "epoch": 4.013303769401331,
1312
+ "grad_norm": 0.22599312663078308,
1313
+ "learning_rate": 0.00011214435464779005,
1314
+ "loss": 1.1088,
1315
  "step": 905
1316
  },
1317
  {
1318
  "epoch": 4.035476718403547,
1319
+ "grad_norm": 0.26651766896247864,
1320
+ "learning_rate": 0.00010729349232831092,
1321
+ "loss": 1.0774,
1322
  "step": 910
1323
  },
1324
  {
1325
  "epoch": 4.057649667405765,
1326
+ "grad_norm": 0.2449249029159546,
1327
+ "learning_rate": 0.00010253724024093103,
1328
+ "loss": 1.0788,
1329
  "step": 915
1330
  },
1331
  {
1332
  "epoch": 4.0798226164079825,
1333
+ "grad_norm": 0.24662241339683533,
1334
+ "learning_rate": 9.787674425439719e-05,
1335
+ "loss": 1.0857,
1336
  "step": 920
1337
  },
1338
  {
1339
  "epoch": 4.101995565410199,
1340
+ "grad_norm": 0.24599789083003998,
1341
+ "learning_rate": 9.331312716804791e-05,
1342
+ "loss": 1.0937,
1343
  "step": 925
1344
  },
1345
  {
1346
  "epoch": 4.124168514412417,
1347
+ "grad_norm": 0.25388607382774353,
1348
+ "learning_rate": 8.884748844130986e-05,
1349
+ "loss": 1.0834,
1350
  "step": 930
1351
  },
1352
  {
1353
  "epoch": 4.146341463414634,
1354
+ "grad_norm": 0.24923333525657654,
1355
+ "learning_rate": 8.448090392881796e-05,
1356
+ "loss": 1.0861,
1357
  "step": 935
1358
  },
1359
  {
1360
  "epoch": 4.168514412416852,
1361
+ "grad_norm": 0.24948322772979736,
1362
+ "learning_rate": 8.021442562122194e-05,
1363
+ "loss": 1.0852,
1364
  "step": 940
1365
  },
1366
  {
1367
  "epoch": 4.1906873614190685,
1368
+ "grad_norm": 0.25804150104522705,
1369
+ "learning_rate": 7.604908139174255e-05,
1370
+ "loss": 1.0998,
1371
  "step": 945
1372
  },
1373
  {
1374
  "epoch": 4.212860310421286,
1375
+ "grad_norm": 0.2608419954776764,
1376
+ "learning_rate": 7.198587474853863e-05,
1377
+ "loss": 1.081,
1378
  "step": 950
1379
  },
1380
  {
1381
  "epoch": 4.235033259423504,
1382
+ "grad_norm": 0.2649877965450287,
1383
+ "learning_rate": 6.802578459294235e-05,
1384
+ "loss": 1.0946,
1385
  "step": 955
1386
  },
1387
  {
1388
  "epoch": 4.25720620842572,
1389
+ "grad_norm": 0.25245440006256104,
1390
+ "learning_rate": 6.416976498362431e-05,
1391
+ "loss": 1.0893,
1392
  "step": 960
1393
  },
1394
  {
1395
  "epoch": 4.279379157427938,
1396
+ "grad_norm": 0.2566092908382416,
1397
+ "learning_rate": 6.041874490674415e-05,
1398
+ "loss": 1.0925,
1399
  "step": 965
1400
  },
1401
  {
1402
  "epoch": 4.301552106430155,
1403
+ "grad_norm": 0.25820818543434143,
1404
+ "learning_rate": 5.6773628052139036e-05,
1405
+ "loss": 1.0999,
1406
  "step": 970
1407
  },
1408
  {
1409
  "epoch": 4.323725055432373,
1410
+ "grad_norm": 0.24905657768249512,
1411
+ "learning_rate": 5.3235292595609106e-05,
1412
+ "loss": 1.0727,
1413
  "step": 975
1414
  },
1415
  {
1416
  "epoch": 4.34589800443459,
1417
+ "grad_norm": 0.2517815828323364,
1418
+ "learning_rate": 4.9804590987348854e-05,
1419
+ "loss": 1.0929,
1420
  "step": 980
1421
  },
1422
  {
1423
  "epoch": 4.368070953436807,
1424
+ "grad_norm": 0.25075092911720276,
1425
+ "learning_rate": 4.648234974657578e-05,
1426
+ "loss": 1.1038,
1427
  "step": 985
1428
  },
1429
  {
1430
  "epoch": 4.390243902439025,
1431
+ "grad_norm": 0.25245392322540283,
1432
+ "learning_rate": 4.326936926240682e-05,
1433
+ "loss": 1.0721,
1434
  "step": 990
1435
  },
1436
  {
1437
  "epoch": 4.412416851441241,
1438
+ "grad_norm": 0.2557605504989624,
1439
+ "learning_rate": 4.0166423601029736e-05,
1440
+ "loss": 1.0926,
1441
  "step": 995
1442
  },
1443
  {
1444
  "epoch": 4.434589800443459,
1445
+ "grad_norm": 0.24609370529651642,
1446
+ "learning_rate": 3.717426031921639e-05,
1447
+ "loss": 1.0866,
1448
  "step": 1000
1449
  },
1450
  {
1451
  "epoch": 4.4567627494456765,
1452
+ "grad_norm": 0.2574557363986969,
1453
+ "learning_rate": 3.429360028422307e-05,
1454
+ "loss": 1.0874,
1455
  "step": 1005
1456
  },
1457
  {
1458
  "epoch": 4.478935698447893,
1459
+ "grad_norm": 0.256829172372818,
1460
+ "learning_rate": 3.152513750011921e-05,
1461
+ "loss": 1.092,
1462
  "step": 1010
1463
  },
1464
  {
1465
  "epoch": 4.501108647450111,
1466
+ "grad_norm": 0.25330841541290283,
1467
+ "learning_rate": 2.8869538940589802e-05,
1468
+ "loss": 1.0949,
1469
  "step": 1015
1470
  },
1471
  {
1472
  "epoch": 4.523281596452328,
1473
+ "grad_norm": 0.25692063570022583,
1474
+ "learning_rate": 2.6327444388249076e-05,
1475
+ "loss": 1.0946,
1476
  "step": 1020
1477
  },
1478
  {
1479
  "epoch": 4.545454545454545,
1480
+ "grad_norm": 0.25190240144729614,
1481
+ "learning_rate": 2.3899466280504933e-05,
1482
+ "loss": 1.0901,
1483
  "step": 1025
1484
  },
1485
  {
1486
  "epoch": 4.5676274944567625,
1487
+ "grad_norm": 0.25231993198394775,
1488
+ "learning_rate": 2.158618956201158e-05,
1489
+ "loss": 1.095,
1490
  "step": 1030
1491
  },
1492
  {
1493
  "epoch": 4.58980044345898,
1494
+ "grad_norm": 0.25305166840553284,
1495
+ "learning_rate": 1.9388171543745393e-05,
1496
+ "loss": 1.0747,
1497
  "step": 1035
1498
  },
1499
  {
1500
  "epoch": 4.611973392461198,
1501
+ "grad_norm": 0.2522623538970947,
1502
+ "learning_rate": 1.730594176873851e-05,
1503
+ "loss": 1.0927,
1504
  "step": 1040
1505
  },
1506
  {
1507
  "epoch": 4.634146341463414,
1508
+ "grad_norm": 0.2547568678855896,
1509
+ "learning_rate": 1.5340001884502576e-05,
1510
+ "loss": 1.1009,
1511
  "step": 1045
1512
  },
1513
  {
1514
  "epoch": 4.656319290465632,
1515
+ "grad_norm": 0.25178787112236023,
1516
+ "learning_rate": 1.3490825522172012e-05,
1517
+ "loss": 1.0879,
1518
  "step": 1050
1519
  },
1520
  {
1521
  "epoch": 4.678492239467849,
1522
+ "grad_norm": 0.2568508982658386,
1523
+ "learning_rate": 1.1758858182397692e-05,
1524
+ "loss": 1.0887,
1525
  "step": 1055
1526
  },
1527
  {
1528
  "epoch": 4.700665188470067,
1529
+ "grad_norm": 0.2606378197669983,
1530
+ "learning_rate": 1.014451712801806e-05,
1531
+ "loss": 1.0881,
1532
  "step": 1060
1533
  },
1534
  {
1535
  "epoch": 4.722838137472284,
1536
+ "grad_norm": 0.24862989783287048,
1537
+ "learning_rate": 8.648191283532336e-06,
1538
+ "loss": 1.0887,
1539
  "step": 1065
1540
  },
1541
  {
1542
  "epoch": 4.745011086474501,
1543
+ "grad_norm": 0.24630777537822723,
1544
+ "learning_rate": 7.270241141401568e-06,
1545
+ "loss": 1.0745,
1546
  "step": 1070
1547
  },
1548
  {
1549
  "epoch": 4.767184035476719,
1550
+ "grad_norm": 0.26346486806869507,
1551
+ "learning_rate": 6.010998675199553e-06,
1552
+ "loss": 1.1047,
1553
  "step": 1075
1554
  },
1555
  {
1556
  "epoch": 4.789356984478935,
1557
+ "grad_norm": 0.25308722257614136,
1558
+ "learning_rate": 4.870767259633868e-06,
1559
+ "loss": 1.0981,
1560
  "step": 1080
1561
  },
1562
  {
1563
  "epoch": 4.811529933481153,
1564
+ "grad_norm": 0.2649281322956085,
1565
+ "learning_rate": 3.849821597457892e-06,
1566
+ "loss": 1.0827,
1567
  "step": 1085
1568
  },
1569
  {
1570
  "epoch": 4.8337028824833705,
1571
+ "grad_norm": 0.2485639750957489,
1572
+ "learning_rate": 2.948407653289409e-06,
1573
+ "loss": 1.0849,
1574
  "step": 1090
1575
  },
1576
  {
1577
  "epoch": 4.855875831485587,
1578
+ "grad_norm": 0.25217127799987793,
1579
+ "learning_rate": 2.166742594353288e-06,
1580
+ "loss": 1.0861,
1581
  "step": 1095
1582
  },
1583
  {
1584
  "epoch": 4.878048780487805,
1585
+ "grad_norm": 0.2509874403476715,
1586
+ "learning_rate": 1.5050147381619473e-06,
1587
+ "loss": 1.1002,
1588
  "step": 1100
1589
  },
1590
  {
1591
  "epoch": 4.900221729490022,
1592
+ "grad_norm": 0.2537370026111603,
1593
+ "learning_rate": 9.633835071463092e-07,
1594
+ "loss": 1.0815,
1595
  "step": 1105
1596
  },
1597
  {
1598
  "epoch": 4.922394678492239,
1599
+ "grad_norm": 0.25824105739593506,
1600
+ "learning_rate": 5.419793902477488e-07,
1601
+ "loss": 1.085,
1602
  "step": 1110
1603
  },
1604
  {
1605
  "epoch": 4.9445676274944566,
1606
+ "grad_norm": 0.2534651756286621,
1607
+ "learning_rate": 2.4090391148112736e-07,
1608
+ "loss": 1.0952,
1609
  "step": 1115
1610
  },
1611
  {
1612
  "epoch": 4.966740576496674,
1613
+ "grad_norm": 0.25321847200393677,
1614
+ "learning_rate": 6.022960547563683e-08,
1615
+ "loss": 1.0899,
1616
  "step": 1120
1617
  },
1618
  {
1619
  "epoch": 4.988913525498892,
1620
+ "grad_norm": 0.2509351968765259,
1621
  "learning_rate": 0.0,
1622
+ "loss": 1.098,
1623
  "step": 1125
1624
  },
1625
  {
1626
  "epoch": 4.988913525498892,
1627
+ "eval_loss": 1.9855170249938965,
1628
+ "eval_runtime": 0.3384,
1629
+ "eval_samples_per_second": 2.955,
1630
+ "eval_steps_per_second": 2.955,
1631
  "step": 1125
1632
  },
1633
  {
1634
  "epoch": 4.988913525498892,
1635
  "step": 1125,
1636
  "total_flos": 1.6629843858229821e+18,
1637
+ "train_loss": 1.2764637470245361,
1638
+ "train_runtime": 3608.211,
1639
+ "train_samples_per_second": 9.984,
1640
  "train_steps_per_second": 0.312
1641
  }
1642
  ],