BounharAbdelaziz commited on
Commit
f7f71db
1 Parent(s): b1d6a31

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -11,17 +11,17 @@
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
  "lora_alpha": 16,
14
- "lora_dropout": 0.1,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
- "r": 64,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "q_proj",
24
- "v_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
 
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
  "lora_alpha": 16,
14
+ "lora_dropout": 0.15,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
+ "r": 256,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d0e0707e43a9e9fe7d2f680b16f042b41e8e8a07c4c7dce34dcf3f2c2d39b95
3
- size 51132072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bead1aedc8910fcc9b2bd6be5c43e6f3f09b2eb2adfd053c047dcbd3112bce7f
3
+ size 204486560
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dda9cf296b67e694f8058fbc420ad9891df7d5fe78a2bf4cf4791dddc2a8ca78
3
  size 4988025488
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:913606a6440e60476970cd6cd86be1b695d3311ede7dd8cc86416552ee2ac500
3
  size 4988025488
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07800b1e30a60616187dd6ffdc7ffaeaa002b792d17dee431c08017b67880c39
3
  size 240691712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a0556278ab291851893954c2da87ed3fe2e20fe2c813ae91ac11062678129ec
3
  size 240691712
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4fddafdc0a262675c005f0fb1a82f0dc59ef45e773b2b4554bbdbe475b4f8c3
3
- size 102300346
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a58c1393e0480d870912037bcdf402732a8a7aa79ebdfce9c548518490a2546
3
+ size 409009786
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d81f0f10258c1cba56da4c28c08d82046c7a091d96fe74cf2719a38ac82e7cfe
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:457fde7689fecd18a1e7e772622122b6415ae5f684c1c653fda8583567535a48
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c8bc5ca62c62ea2472d0194e675c1232665d8163971e96e1dc15b94be793fc1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd2103bfcabaab6163aebcdf8618dc42afda820d0d3cde332bbcda47ae31fee6
3
  size 1064
trainer_state.json CHANGED
@@ -1,2412 +1,1215 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 34247,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0029199637924489734,
13
- "grad_norm": NaN,
14
- "learning_rate": 1.906614785992218e-05,
15
- "loss": 4.8545,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.005839927584897947,
20
- "grad_norm": 2.070359230041504,
21
- "learning_rate": 3.852140077821012e-05,
22
- "loss": 4.5735,
23
  "step": 200
24
  },
25
  {
26
- "epoch": 0.008759891377346922,
27
- "grad_norm": 2.1655333042144775,
28
- "learning_rate": 5.7976653696498064e-05,
29
- "loss": 4.2565,
30
  "step": 300
31
  },
32
  {
33
- "epoch": 0.011679855169795894,
34
- "grad_norm": 3.0300276279449463,
35
- "learning_rate": 7.7431906614786e-05,
36
- "loss": 4.157,
37
  "step": 400
38
  },
39
  {
40
- "epoch": 0.014599818962244868,
41
- "grad_norm": 2.706392526626587,
42
- "learning_rate": 9.688715953307394e-05,
43
- "loss": 4.0463,
44
  "step": 500
45
  },
46
  {
47
- "epoch": 0.017519782754693843,
48
- "grad_norm": 3.1594862937927246,
49
- "learning_rate": 0.00011634241245136189,
50
- "loss": 3.9791,
51
  "step": 600
52
  },
53
  {
54
- "epoch": 0.020439746547142815,
55
- "grad_norm": 3.447601556777954,
56
- "learning_rate": 0.0001357976653696498,
57
- "loss": 3.9192,
58
  "step": 700
59
  },
60
  {
61
- "epoch": 0.023359710339591788,
62
- "grad_norm": 2.8876287937164307,
63
- "learning_rate": 0.00015525291828793775,
64
- "loss": 3.9025,
65
  "step": 800
66
  },
67
  {
68
- "epoch": 0.026279674132040763,
69
- "grad_norm": 2.7174556255340576,
70
- "learning_rate": 0.0001747081712062257,
71
- "loss": 3.7831,
72
  "step": 900
73
  },
74
  {
75
- "epoch": 0.029199637924489735,
76
- "grad_norm": 3.173799753189087,
77
- "learning_rate": 0.00019416342412451363,
78
- "loss": 3.7109,
79
  "step": 1000
80
  },
81
  {
82
- "epoch": 0.03211960171693871,
83
- "grad_norm": 3.4008660316467285,
84
- "learning_rate": 0.00019999780875400972,
85
- "loss": 3.6651,
86
  "step": 1100
87
  },
88
  {
89
- "epoch": 0.035039565509387686,
90
- "grad_norm": 3.263437509536743,
91
- "learning_rate": 0.00019998707635176594,
92
- "loss": 3.5635,
93
  "step": 1200
94
  },
95
  {
96
- "epoch": 0.03795952930183666,
97
- "grad_norm": 4.769214153289795,
98
- "learning_rate": 0.000199967401278203,
99
- "loss": 3.4451,
100
  "step": 1300
101
  },
102
  {
103
- "epoch": 0.04087949309428563,
104
- "grad_norm": 3.8638391494750977,
105
- "learning_rate": 0.0001999387852930255,
106
- "loss": 3.406,
107
  "step": 1400
108
  },
109
  {
110
- "epoch": 0.0437994568867346,
111
- "grad_norm": 3.9781579971313477,
112
- "learning_rate": 0.00019990123095559763,
113
- "loss": 3.3305,
114
  "step": 1500
115
  },
116
  {
117
- "epoch": 0.046719420679183575,
118
- "grad_norm": 4.234069347381592,
119
- "learning_rate": 0.00019985474162471448,
120
- "loss": 3.2528,
121
  "step": 1600
122
  },
123
  {
124
- "epoch": 0.049639384471632554,
125
- "grad_norm": 3.5920116901397705,
126
- "learning_rate": 0.00019979932145830148,
127
- "loss": 3.1945,
128
  "step": 1700
129
  },
130
  {
131
- "epoch": 0.052559348264081526,
132
- "grad_norm": 4.009890556335449,
133
- "learning_rate": 0.00019973497541304251,
134
- "loss": 3.0998,
135
  "step": 1800
136
  },
137
  {
138
- "epoch": 0.0554793120565305,
139
- "grad_norm": 3.665900707244873,
140
- "learning_rate": 0.00019966170924393665,
141
- "loss": 2.9929,
142
  "step": 1900
143
  },
144
  {
145
- "epoch": 0.05839927584897947,
146
- "grad_norm": 5.364969730377197,
147
- "learning_rate": 0.00019957952950378348,
148
- "loss": 2.9956,
149
  "step": 2000
150
  },
151
  {
152
- "epoch": 0.06131923964142845,
153
- "grad_norm": 5.028293132781982,
154
- "learning_rate": 0.00019948844354259694,
155
- "loss": 2.9441,
156
  "step": 2100
157
  },
158
  {
159
- "epoch": 0.06423920343387741,
160
- "grad_norm": 4.162110328674316,
161
- "learning_rate": 0.0001993895033640701,
162
- "loss": 2.8629,
163
  "step": 2200
164
  },
165
  {
166
- "epoch": 0.0671591672263264,
167
- "grad_norm": 5.760902404785156,
168
- "learning_rate": 0.0001992807190408027,
169
- "loss": 2.8007,
170
  "step": 2300
171
  },
172
  {
173
- "epoch": 0.07007913101877537,
174
- "grad_norm": 5.505645751953125,
175
- "learning_rate": 0.00019916305522159333,
176
- "loss": 2.7621,
177
  "step": 2400
178
  },
179
  {
180
- "epoch": 0.07299909481122434,
181
- "grad_norm": 4.4763946533203125,
182
- "learning_rate": 0.00019903652243009065,
183
- "loss": 2.7013,
184
  "step": 2500
185
  },
186
  {
187
- "epoch": 0.07591905860367332,
188
- "grad_norm": 4.271712779998779,
189
- "learning_rate": 0.00019890113198316889,
190
- "loss": 2.618,
191
  "step": 2600
192
  },
193
  {
194
- "epoch": 0.07883902239612228,
195
- "grad_norm": 5.071974277496338,
196
- "learning_rate": 0.00019875689598991564,
197
- "loss": 2.5495,
198
  "step": 2700
199
  },
200
  {
201
- "epoch": 0.08175898618857126,
202
- "grad_norm": 4.088096618652344,
203
- "learning_rate": 0.0001986038273505489,
204
- "loss": 2.4626,
205
  "step": 2800
206
  },
207
  {
208
- "epoch": 0.08467894998102024,
209
- "grad_norm": 6.447364807128906,
210
- "learning_rate": 0.0001984419397552632,
211
- "loss": 2.455,
212
  "step": 2900
213
  },
214
  {
215
- "epoch": 0.0875989137734692,
216
- "grad_norm": 5.120512008666992,
217
- "learning_rate": 0.0001982712476830054,
218
- "loss": 2.4463,
219
  "step": 3000
220
  },
221
  {
222
- "epoch": 0.09051887756591818,
223
- "grad_norm": 4.25977087020874,
224
- "learning_rate": 0.0001980917664001794,
225
- "loss": 2.3021,
226
  "step": 3100
227
  },
228
  {
229
- "epoch": 0.09343884135836715,
230
- "grad_norm": 7.021845817565918,
231
- "learning_rate": 0.0001979035119592809,
232
- "loss": 2.2682,
233
  "step": 3200
234
  },
235
  {
236
- "epoch": 0.09635880515081613,
237
- "grad_norm": 5.458634853363037,
238
- "learning_rate": 0.00019770650119746192,
239
- "loss": 2.2192,
240
  "step": 3300
241
  },
242
  {
243
- "epoch": 0.09927876894326511,
244
- "grad_norm": 7.42665433883667,
245
- "learning_rate": 0.00019750075173502444,
246
- "loss": 2.1495,
247
  "step": 3400
248
  },
249
  {
250
- "epoch": 0.10219873273571407,
251
- "grad_norm": 5.143280029296875,
252
- "learning_rate": 0.00019728628197384485,
253
- "loss": 2.1331,
254
  "step": 3500
255
  },
256
  {
257
- "epoch": 0.10511869652816305,
258
- "grad_norm": 6.114063739776611,
259
- "learning_rate": 0.00019706311109572792,
260
- "loss": 2.1047,
261
  "step": 3600
262
  },
263
  {
264
- "epoch": 0.10803866032061202,
265
- "grad_norm": 7.758254051208496,
266
- "learning_rate": 0.00019683362048532321,
267
- "loss": 2.0496,
268
  "step": 3700
269
  },
270
  {
271
- "epoch": 0.110958624113061,
272
- "grad_norm": 5.546494483947754,
273
- "learning_rate": 0.00019659319452882684,
274
- "loss": 1.9042,
275
  "step": 3800
276
  },
277
  {
278
- "epoch": 0.11387858790550998,
279
- "grad_norm": 6.260044574737549,
280
- "learning_rate": 0.0001963441294439365,
281
- "loss": 1.9357,
282
  "step": 3900
283
  },
284
  {
285
- "epoch": 0.11679855169795894,
286
- "grad_norm": 6.014352321624756,
287
- "learning_rate": 0.00019608644750660297,
288
- "loss": 1.8152,
289
  "step": 4000
290
  },
291
  {
292
- "epoch": 0.11971851549040792,
293
- "grad_norm": 4.853186130523682,
294
- "learning_rate": 0.00019582017176345346,
295
- "loss": 1.7449,
296
  "step": 4100
297
  },
298
  {
299
- "epoch": 0.1226384792828569,
300
- "grad_norm": 4.977631568908691,
301
- "learning_rate": 0.0001955453260297302,
302
- "loss": 1.6956,
303
  "step": 4200
304
  },
305
  {
306
- "epoch": 0.12555844307530586,
307
- "grad_norm": 3.9587907791137695,
308
- "learning_rate": 0.0001952619348871607,
309
- "loss": 1.6859,
310
  "step": 4300
311
  },
312
  {
313
- "epoch": 0.12847840686775483,
314
- "grad_norm": 6.316972255706787,
315
- "learning_rate": 0.000194970023681759,
316
- "loss": 1.6503,
317
  "step": 4400
318
  },
319
  {
320
- "epoch": 0.13139837066020382,
321
- "grad_norm": 5.7269368171691895,
322
- "learning_rate": 0.0001946726645306457,
323
- "loss": 1.6513,
324
  "step": 4500
325
  },
326
  {
327
- "epoch": 0.1343183344526528,
328
- "grad_norm": 6.6521525382995605,
329
- "learning_rate": 0.00019436387681875253,
330
- "loss": 1.6014,
331
  "step": 4600
332
  },
333
  {
334
- "epoch": 0.13723829824510175,
335
- "grad_norm": 7.724483013153076,
336
- "learning_rate": 0.0001940466493647889,
337
- "loss": 1.4944,
338
  "step": 4700
339
  },
340
  {
341
- "epoch": 0.14015826203755075,
342
- "grad_norm": 7.2344746589660645,
343
- "learning_rate": 0.00019372101054103004,
344
- "loss": 1.4505,
345
  "step": 4800
346
  },
347
  {
348
- "epoch": 0.1430782258299997,
349
- "grad_norm": 6.845386981964111,
350
- "learning_rate": 0.0001933869894720496,
351
- "loss": 1.4348,
352
  "step": 4900
353
  },
354
  {
355
- "epoch": 0.14599818962244868,
356
- "grad_norm": 6.9420552253723145,
357
- "learning_rate": 0.0001930446160321148,
358
- "loss": 1.389,
359
  "step": 5000
360
  },
361
  {
362
- "epoch": 0.14891815341489764,
363
- "grad_norm": 4.138674736022949,
364
- "learning_rate": 0.00019269392084251438,
365
- "loss": 1.3654,
366
  "step": 5100
367
  },
368
  {
369
- "epoch": 0.15183811720734663,
370
- "grad_norm": 3.841850519180298,
371
- "learning_rate": 0.00019233493526882008,
372
- "loss": 1.3368,
373
  "step": 5200
374
  },
375
  {
376
- "epoch": 0.1547580809997956,
377
- "grad_norm": 4.870452880859375,
378
- "learning_rate": 0.00019196769141808124,
379
- "loss": 1.3552,
380
  "step": 5300
381
  },
382
  {
383
- "epoch": 0.15767804479224456,
384
- "grad_norm": 5.648431301116943,
385
- "learning_rate": 0.00019159222213595325,
386
- "loss": 1.3289,
387
  "step": 5400
388
  },
389
  {
390
- "epoch": 0.16059800858469356,
391
- "grad_norm": 6.571835041046143,
392
- "learning_rate": 0.0001912085610037598,
393
- "loss": 1.2027,
394
  "step": 5500
395
  },
396
  {
397
- "epoch": 0.16351797237714252,
398
- "grad_norm": 4.793326377868652,
399
- "learning_rate": 0.00019081674233548964,
400
- "loss": 1.2058,
401
  "step": 5600
402
  },
403
  {
404
- "epoch": 0.1664379361695915,
405
- "grad_norm": 6.252863883972168,
406
- "learning_rate": 0.00019041680117472726,
407
- "loss": 1.1419,
408
  "step": 5700
409
  },
410
  {
411
- "epoch": 0.16935789996204048,
412
- "grad_norm": 5.3782124519348145,
413
- "learning_rate": 0.00019000877329151893,
414
- "loss": 1.1386,
415
  "step": 5800
416
  },
417
  {
418
- "epoch": 0.17227786375448945,
419
- "grad_norm": 8.507604598999023,
420
- "learning_rate": 0.0001895926951791734,
421
- "loss": 1.1592,
422
  "step": 5900
423
  },
424
  {
425
- "epoch": 0.1751978275469384,
426
- "grad_norm": 7.945569038391113,
427
- "learning_rate": 0.00018916860405099808,
428
- "loss": 1.0545,
429
  "step": 6000
430
  },
431
  {
432
- "epoch": 0.1781177913393874,
433
- "grad_norm": 6.228450775146484,
434
- "learning_rate": 0.00018873653783697055,
435
- "loss": 1.1276,
436
  "step": 6100
437
  },
438
  {
439
- "epoch": 0.18103775513183637,
440
- "grad_norm": 4.703051567077637,
441
- "learning_rate": 0.0001882965351803463,
442
- "loss": 1.0775,
443
  "step": 6200
444
  },
445
  {
446
- "epoch": 0.18395771892428534,
447
- "grad_norm": 7.743706226348877,
448
- "learning_rate": 0.00018784863543420258,
449
- "loss": 1.0611,
450
  "step": 6300
451
  },
452
  {
453
- "epoch": 0.1868776827167343,
454
- "grad_norm": 7.018481254577637,
455
- "learning_rate": 0.0001873928786579187,
456
- "loss": 0.9811,
457
  "step": 6400
458
  },
459
  {
460
- "epoch": 0.1897976465091833,
461
- "grad_norm": 6.171621799468994,
462
- "learning_rate": 0.00018692930561359307,
463
- "loss": 1.0307,
464
  "step": 6500
465
  },
466
  {
467
- "epoch": 0.19271761030163226,
468
- "grad_norm": 9.385793685913086,
469
- "learning_rate": 0.0001864579577623977,
470
- "loss": 1.0374,
471
  "step": 6600
472
  },
473
  {
474
- "epoch": 0.19563757409408122,
475
- "grad_norm": 10.805081367492676,
476
- "learning_rate": 0.00018597887726086985,
477
- "loss": 1.0033,
478
  "step": 6700
479
  },
480
  {
481
- "epoch": 0.19855753788653022,
482
- "grad_norm": 4.207973957061768,
483
- "learning_rate": 0.00018549210695714162,
484
- "loss": 0.9768,
485
  "step": 6800
486
  },
487
  {
488
- "epoch": 0.20147750167897918,
489
- "grad_norm": 4.130248069763184,
490
- "learning_rate": 0.0001849976903871079,
491
- "loss": 0.9005,
492
  "step": 6900
493
  },
494
  {
495
- "epoch": 0.20439746547142815,
496
- "grad_norm": 4.633413791656494,
497
- "learning_rate": 0.00018449567177053215,
498
- "loss": 0.9012,
499
  "step": 7000
500
  },
501
  {
502
- "epoch": 0.20731742926387714,
503
- "grad_norm": 4.062277793884277,
504
- "learning_rate": 0.000183986096007092,
505
- "loss": 0.8939,
506
  "step": 7100
507
  },
508
  {
509
- "epoch": 0.2102373930563261,
510
- "grad_norm": 8.527688026428223,
511
- "learning_rate": 0.00018346900867236305,
512
- "loss": 0.9296,
513
  "step": 7200
514
  },
515
  {
516
- "epoch": 0.21315735684877507,
517
- "grad_norm": 8.69731616973877,
518
- "learning_rate": 0.000182944456013743,
519
- "loss": 0.885,
520
  "step": 7300
521
  },
522
  {
523
- "epoch": 0.21607732064122404,
524
- "grad_norm": 5.987290382385254,
525
- "learning_rate": 0.00018241248494631505,
526
- "loss": 0.8927,
527
  "step": 7400
528
  },
529
  {
530
- "epoch": 0.21899728443367303,
531
- "grad_norm": 4.5624566078186035,
532
- "learning_rate": 0.00018187314304865225,
533
- "loss": 0.7979,
534
  "step": 7500
535
  },
536
  {
537
- "epoch": 0.221917248226122,
538
- "grad_norm": 6.830573081970215,
539
- "learning_rate": 0.00018132647855856188,
540
- "loss": 0.7968,
541
  "step": 7600
542
  },
543
  {
544
- "epoch": 0.22483721201857096,
545
- "grad_norm": 6.950575828552246,
546
- "learning_rate": 0.00018077254036877118,
547
- "loss": 0.8561,
548
  "step": 7700
549
  },
550
  {
551
- "epoch": 0.22775717581101995,
552
- "grad_norm": 3.847403049468994,
553
- "learning_rate": 0.00018021137802255458,
554
- "loss": 0.773,
555
  "step": 7800
556
  },
557
  {
558
- "epoch": 0.23067713960346892,
559
- "grad_norm": 4.9719133377075195,
560
- "learning_rate": 0.0001796430417093026,
561
- "loss": 0.7673,
562
  "step": 7900
563
  },
564
  {
565
- "epoch": 0.23359710339591788,
566
- "grad_norm": 6.954914569854736,
567
- "learning_rate": 0.0001790675822600329,
568
- "loss": 0.8196,
569
  "step": 8000
570
  },
571
  {
572
- "epoch": 0.23651706718836688,
573
- "grad_norm": 5.7400336265563965,
574
- "learning_rate": 0.00017848505114284423,
575
- "loss": 0.7128,
576
  "step": 8100
577
  },
578
  {
579
- "epoch": 0.23943703098081584,
580
- "grad_norm": 6.180181503295898,
581
- "learning_rate": 0.00017789550045831294,
582
- "loss": 0.8407,
583
  "step": 8200
584
  },
585
  {
586
- "epoch": 0.2423569947732648,
587
- "grad_norm": 7.137686729431152,
588
- "learning_rate": 0.00017729898293483348,
589
- "loss": 0.7785,
590
  "step": 8300
591
  },
592
  {
593
- "epoch": 0.2452769585657138,
594
- "grad_norm": 3.9680042266845703,
595
- "learning_rate": 0.00017669555192390235,
596
- "loss": 0.7693,
597
  "step": 8400
598
  },
599
  {
600
- "epoch": 0.24819692235816276,
601
- "grad_norm": 6.744853496551514,
602
- "learning_rate": 0.00017608526139534626,
603
- "loss": 0.7528,
604
  "step": 8500
605
  },
606
  {
607
- "epoch": 0.25111688615061173,
608
- "grad_norm": 3.3069093227386475,
609
- "learning_rate": 0.00017546816593249552,
610
- "loss": 0.7815,
611
  "step": 8600
612
  },
613
  {
614
- "epoch": 0.2540368499430607,
615
- "grad_norm": 4.526154041290283,
616
- "learning_rate": 0.00017484432072730185,
617
- "loss": 0.7127,
618
  "step": 8700
619
  },
620
  {
621
- "epoch": 0.25695681373550966,
622
- "grad_norm": NaN,
623
- "learning_rate": 0.00017422011991728064,
624
- "loss": 0.7122,
625
  "step": 8800
626
  },
627
  {
628
- "epoch": 0.2598767775279587,
629
- "grad_norm": 4.636015892028809,
630
- "learning_rate": 0.00017358300930742573,
631
- "loss": 0.7137,
632
  "step": 8900
633
  },
634
  {
635
- "epoch": 0.26279674132040765,
636
- "grad_norm": 5.9968180656433105,
637
- "learning_rate": 0.0001729393175603787,
638
- "loss": 0.7437,
639
  "step": 9000
640
  },
641
  {
642
- "epoch": 0.2657167051128566,
643
- "grad_norm": 3.8191442489624023,
644
- "learning_rate": 0.00017228910224681697,
645
- "loss": 0.7169,
646
  "step": 9100
647
  },
648
  {
649
- "epoch": 0.2686366689053056,
650
- "grad_norm": 7.120567321777344,
651
- "learning_rate": 0.00017163242152087461,
652
- "loss": 0.7889,
653
  "step": 9200
654
  },
655
  {
656
- "epoch": 0.27155663269775454,
657
- "grad_norm": 6.114504337310791,
658
- "learning_rate": 0.0001709693341149409,
659
- "loss": 0.7305,
660
  "step": 9300
661
  },
662
  {
663
- "epoch": 0.2744765964902035,
664
- "grad_norm": 5.748324394226074,
665
- "learning_rate": 0.00017029989933440754,
666
- "loss": 0.6722,
667
  "step": 9400
668
  },
669
  {
670
- "epoch": 0.27739656028265247,
671
- "grad_norm": 9.556855201721191,
672
- "learning_rate": 0.00016962417705236436,
673
- "loss": 0.7727,
674
  "step": 9500
675
  },
676
  {
677
- "epoch": 0.2803165240751015,
678
- "grad_norm": 6.288482189178467,
679
- "learning_rate": 0.00016894907782189354,
680
- "loss": 0.725,
681
  "step": 9600
682
  },
683
  {
684
- "epoch": 0.28323648786755046,
685
- "grad_norm": 8.286351203918457,
686
- "learning_rate": 0.00016826102375707788,
687
- "loss": 0.7074,
688
  "step": 9700
689
  },
690
  {
691
- "epoch": 0.2861564516599994,
692
- "grad_norm": 6.121710777282715,
693
- "learning_rate": 0.00016756686454426069,
694
- "loss": 0.7093,
695
  "step": 9800
696
  },
697
  {
698
- "epoch": 0.2890764154524484,
699
- "grad_norm": 4.868896007537842,
700
- "learning_rate": 0.00016686666226784234,
701
- "loss": 0.6829,
702
  "step": 9900
703
  },
704
  {
705
- "epoch": 0.29199637924489735,
706
- "grad_norm": 3.8444511890411377,
707
- "learning_rate": 0.00016616047955270445,
708
- "loss": 0.6467,
709
  "step": 10000
710
  },
711
  {
712
- "epoch": 0.2949163430373463,
713
- "grad_norm": 6.739651679992676,
714
- "learning_rate": 0.00016544837955860867,
715
- "loss": 0.6722,
716
  "step": 10100
717
  },
718
  {
719
- "epoch": 0.2978363068297953,
720
- "grad_norm": 6.124034404754639,
721
- "learning_rate": 0.00016473042597454782,
722
- "loss": 0.7016,
723
  "step": 10200
724
  },
725
  {
726
- "epoch": 0.3007562706222443,
727
- "grad_norm": 7.008491039276123,
728
- "learning_rate": 0.00016400668301304983,
729
- "loss": 0.6901,
730
  "step": 10300
731
  },
732
  {
733
- "epoch": 0.30367623441469327,
734
- "grad_norm": 3.4143664836883545,
735
- "learning_rate": 0.00016327721540443436,
736
- "loss": 0.6092,
737
  "step": 10400
738
  },
739
  {
740
- "epoch": 0.30659619820714223,
741
- "grad_norm": 2.6440017223358154,
742
- "learning_rate": 0.00016254208839102386,
743
- "loss": 0.6216,
744
  "step": 10500
745
  },
746
  {
747
- "epoch": 0.3095161619995912,
748
- "grad_norm": 6.059576511383057,
749
- "learning_rate": 0.0001618013677213079,
750
- "loss": 0.6949,
751
  "step": 10600
752
  },
753
  {
754
- "epoch": 0.31243612579204016,
755
- "grad_norm": 6.078047752380371,
756
- "learning_rate": 0.0001610551196440632,
757
- "loss": 0.6841,
758
  "step": 10700
759
  },
760
  {
761
- "epoch": 0.31535608958448913,
762
- "grad_norm": 5.9573655128479,
763
- "learning_rate": 0.0001603034109024281,
764
- "loss": 0.6657,
765
  "step": 10800
766
  },
767
  {
768
- "epoch": 0.31827605337693815,
769
- "grad_norm": 4.991495132446289,
770
- "learning_rate": 0.00015954630872793346,
771
- "loss": 0.6755,
772
  "step": 10900
773
  },
774
  {
775
- "epoch": 0.3211960171693871,
776
- "grad_norm": 12.312626838684082,
777
- "learning_rate": 0.00015878388083448943,
778
- "loss": 0.688,
779
  "step": 11000
780
  },
781
  {
782
- "epoch": 0.3241159809618361,
783
- "grad_norm": 5.133008003234863,
784
- "learning_rate": 0.0001580161954123292,
785
- "loss": 0.6165,
786
  "step": 11100
787
  },
788
  {
789
- "epoch": 0.32703594475428505,
790
- "grad_norm": 7.20357084274292,
791
- "learning_rate": 0.00015724332112191028,
792
- "loss": 0.6942,
793
  "step": 11200
794
  },
795
  {
796
- "epoch": 0.329955908546734,
797
- "grad_norm": 7.373988628387451,
798
- "learning_rate": 0.00015646532708777356,
799
- "loss": 0.6662,
800
  "step": 11300
801
  },
802
  {
803
- "epoch": 0.332875872339183,
804
- "grad_norm": 5.183440685272217,
805
- "learning_rate": 0.00015568228289236102,
806
- "loss": 0.6832,
807
  "step": 11400
808
  },
809
  {
810
- "epoch": 0.33579583613163194,
811
- "grad_norm": 2.848907709121704,
812
- "learning_rate": 0.0001548942585697923,
813
- "loss": 0.5977,
814
  "step": 11500
815
  },
816
  {
817
- "epoch": 0.33871579992408096,
818
- "grad_norm": 6.83838415145874,
819
- "learning_rate": 0.000154101324599601,
820
- "loss": 0.671,
821
  "step": 11600
822
  },
823
  {
824
- "epoch": 0.3416357637165299,
825
- "grad_norm": 10.248309135437012,
826
- "learning_rate": 0.0001533035519004311,
827
- "loss": 0.6546,
828
  "step": 11700
829
  },
830
  {
831
- "epoch": 0.3445557275089789,
832
- "grad_norm": 4.799848556518555,
833
- "learning_rate": 0.00015250101182369421,
834
- "loss": 0.6847,
835
  "step": 11800
836
  },
837
  {
838
- "epoch": 0.34747569130142786,
839
- "grad_norm": 7.513195514678955,
840
- "learning_rate": 0.0001516937761471879,
841
- "loss": 0.6531,
842
  "step": 11900
843
  },
844
  {
845
- "epoch": 0.3503956550938768,
846
- "grad_norm": 5.581793785095215,
847
- "learning_rate": 0.00015088191706867608,
848
- "loss": 0.6354,
849
  "step": 12000
850
  },
851
  {
852
- "epoch": 0.3533156188863258,
853
- "grad_norm": 7.360459804534912,
854
- "learning_rate": 0.00015007369358511694,
855
- "loss": 0.6267,
856
  "step": 12100
857
  },
858
  {
859
- "epoch": 0.3562355826787748,
860
- "grad_norm": 8.538688659667969,
861
- "learning_rate": 0.0001492528503583914,
862
- "loss": 0.7001,
863
  "step": 12200
864
  },
865
  {
866
- "epoch": 0.3591555464712238,
867
- "grad_norm": 5.245034217834473,
868
- "learning_rate": 0.00014842760204184465,
869
- "loss": 0.6134,
870
  "step": 12300
871
  },
872
  {
873
- "epoch": 0.36207551026367274,
874
- "grad_norm": 8.417107582092285,
875
- "learning_rate": 0.00014759802244426005,
876
- "loss": 0.6335,
877
  "step": 12400
878
  },
879
  {
880
- "epoch": 0.3649954740561217,
881
- "grad_norm": 3.8830833435058594,
882
- "learning_rate": 0.00014676418576180312,
883
- "loss": 0.6585,
884
  "step": 12500
885
  },
886
  {
887
- "epoch": 0.36791543784857067,
888
- "grad_norm": 2.1041855812072754,
889
- "learning_rate": 0.00014592616657138586,
890
- "loss": 0.591,
891
  "step": 12600
892
  },
893
  {
894
- "epoch": 0.37083540164101964,
895
- "grad_norm": 6.608900547027588,
896
- "learning_rate": 0.00014508403982399636,
897
- "loss": 0.649,
898
  "step": 12700
899
  },
900
  {
901
- "epoch": 0.3737553654334686,
902
- "grad_norm": 3.221425771713257,
903
- "learning_rate": 0.0001442378808379957,
904
- "loss": 0.6272,
905
  "step": 12800
906
  },
907
  {
908
- "epoch": 0.3766753292259176,
909
- "grad_norm": 5.6488037109375,
910
- "learning_rate": 0.00014338776529238128,
911
- "loss": 0.6104,
912
  "step": 12900
913
  },
914
  {
915
- "epoch": 0.3795952930183666,
916
- "grad_norm": 5.478495121002197,
917
- "learning_rate": 0.0001425337692200184,
918
- "loss": 0.6142,
919
  "step": 13000
920
  },
921
  {
922
- "epoch": 0.38251525681081555,
923
- "grad_norm": 5.134850978851318,
924
- "learning_rate": 0.00014167596900083998,
925
- "loss": 0.6702,
926
  "step": 13100
927
  },
928
  {
929
- "epoch": 0.3854352206032645,
930
- "grad_norm": 6.333113670349121,
931
- "learning_rate": 0.0001408144413550152,
932
- "loss": 0.5947,
933
  "step": 13200
934
  },
935
  {
936
- "epoch": 0.3883551843957135,
937
- "grad_norm": 2.1404473781585693,
938
- "learning_rate": 0.00013994926333608793,
939
- "loss": 0.6002,
940
  "step": 13300
941
  },
942
  {
943
- "epoch": 0.39127514818816245,
944
- "grad_norm": 6.199524402618408,
945
- "learning_rate": 0.000139080512324085,
946
- "loss": 0.6886,
947
  "step": 13400
948
  },
949
  {
950
- "epoch": 0.39419511198061147,
951
- "grad_norm": 9.652315139770508,
952
- "learning_rate": 0.00013820826601859568,
953
- "loss": 0.6008,
954
  "step": 13500
955
  },
956
  {
957
- "epoch": 0.39711507577306043,
958
- "grad_norm": 4.127632141113281,
959
- "learning_rate": 0.0001373326024318221,
960
- "loss": 0.5349,
961
  "step": 13600
962
  },
963
  {
964
- "epoch": 0.4000350395655094,
965
- "grad_norm": 5.289997100830078,
966
- "learning_rate": 0.00013645359988160232,
967
- "loss": 0.5426,
968
  "step": 13700
969
  },
970
  {
971
- "epoch": 0.40295500335795836,
972
- "grad_norm": 5.238863945007324,
973
- "learning_rate": 0.0001355713369844053,
974
- "loss": 0.5761,
975
  "step": 13800
976
  },
977
  {
978
- "epoch": 0.40587496715040733,
979
- "grad_norm": 2.4321208000183105,
980
- "learning_rate": 0.0001346858926483,
981
- "loss": 0.5926,
982
  "step": 13900
983
  },
984
  {
985
- "epoch": 0.4087949309428563,
986
- "grad_norm": 8.464098930358887,
987
- "learning_rate": 0.0001337973460658976,
988
- "loss": 0.5804,
989
  "step": 14000
990
  },
991
  {
992
- "epoch": 0.41171489473530526,
993
- "grad_norm": 3.87615704536438,
994
- "learning_rate": 0.000132905776707269,
995
- "loss": 0.5995,
996
  "step": 14100
997
  },
998
  {
999
- "epoch": 0.4146348585277543,
1000
- "grad_norm": 6.268438339233398,
1001
- "learning_rate": 0.0001320112643128368,
1002
- "loss": 0.5306,
1003
  "step": 14200
1004
  },
1005
  {
1006
- "epoch": 0.41755482232020324,
1007
- "grad_norm": 3.1259851455688477,
1008
- "learning_rate": 0.00013111388888624386,
1009
- "loss": 0.591,
1010
  "step": 14300
1011
  },
1012
  {
1013
- "epoch": 0.4204747861126522,
1014
- "grad_norm": 5.6857829093933105,
1015
- "learning_rate": 0.00013021373068719742,
1016
- "loss": 0.6046,
1017
  "step": 14400
1018
  },
1019
  {
1020
- "epoch": 0.4233947499051012,
1021
- "grad_norm": 6.713433742523193,
1022
- "learning_rate": 0.00012931087022429133,
1023
- "loss": 0.6183,
1024
  "step": 14500
1025
  },
1026
  {
1027
- "epoch": 0.42631471369755014,
1028
- "grad_norm": 3.385892152786255,
1029
- "learning_rate": 0.00012841445577833958,
1030
- "loss": 0.578,
1031
  "step": 14600
1032
  },
1033
  {
1034
- "epoch": 0.4292346774899991,
1035
- "grad_norm": 9.377425193786621,
1036
- "learning_rate": 0.00012750645827667818,
1037
- "loss": 0.6234,
1038
  "step": 14700
1039
  },
1040
  {
1041
- "epoch": 0.43215464128244807,
1042
- "grad_norm": 2.394103765487671,
1043
- "learning_rate": 0.00012659600064492303,
1044
- "loss": 0.5021,
1045
  "step": 14800
1046
  },
1047
  {
1048
- "epoch": 0.4350746050748971,
1049
- "grad_norm": 7.460124492645264,
1050
- "learning_rate": 0.00012568316431283124,
1051
- "loss": 0.5953,
1052
  "step": 14900
1053
  },
1054
  {
1055
- "epoch": 0.43799456886734606,
1056
- "grad_norm": 3.1461405754089355,
1057
- "learning_rate": 0.00012476803092290696,
1058
- "loss": 0.5579,
1059
  "step": 15000
1060
  },
1061
  {
1062
- "epoch": 0.440914532659795,
1063
- "grad_norm": 6.676076412200928,
1064
- "learning_rate": 0.00012385068232309906,
1065
- "loss": 0.6248,
1066
  "step": 15100
1067
  },
1068
  {
1069
- "epoch": 0.443834496452244,
1070
- "grad_norm": 4.148955821990967,
1071
- "learning_rate": 0.00012293120055948103,
1072
- "loss": 0.5219,
1073
  "step": 15200
1074
  },
1075
  {
1076
- "epoch": 0.44675446024469295,
1077
- "grad_norm": 2.290743589401245,
1078
- "learning_rate": 0.00012200966786891287,
1079
- "loss": 0.5686,
1080
  "step": 15300
1081
  },
1082
  {
1083
- "epoch": 0.4496744240371419,
1084
- "grad_norm": 6.50786828994751,
1085
- "learning_rate": 0.00012108616667168586,
1086
- "loss": 0.5667,
1087
  "step": 15400
1088
  },
1089
  {
1090
- "epoch": 0.45259438782959094,
1091
- "grad_norm": 4.148983001708984,
1092
- "learning_rate": 0.00012016077956415119,
1093
- "loss": 0.5426,
1094
  "step": 15500
1095
  },
1096
  {
1097
- "epoch": 0.4555143516220399,
1098
- "grad_norm": 4.001679420471191,
1099
- "learning_rate": 0.00011923358931133255,
1100
- "loss": 0.568,
1101
  "step": 15600
1102
  },
1103
  {
1104
- "epoch": 0.45843431541448887,
1105
- "grad_norm": 3.7630743980407715,
1106
- "learning_rate": 0.00011830467883952387,
1107
- "loss": 0.5927,
1108
  "step": 15700
1109
  },
1110
  {
1111
- "epoch": 0.46135427920693783,
1112
- "grad_norm": 7.692181587219238,
1113
- "learning_rate": 0.00011737413122887249,
1114
- "loss": 0.6512,
1115
  "step": 15800
1116
  },
1117
  {
1118
- "epoch": 0.4642742429993868,
1119
- "grad_norm": 1.6598546504974365,
1120
- "learning_rate": 0.00011644202970594859,
1121
- "loss": 0.5219,
1122
  "step": 15900
1123
  },
1124
  {
1125
- "epoch": 0.46719420679183576,
1126
- "grad_norm": 6.290797710418701,
1127
- "learning_rate": 0.00011550845763630155,
1128
- "loss": 0.6026,
1129
  "step": 16000
1130
  },
1131
  {
1132
- "epoch": 0.47011417058428473,
1133
- "grad_norm": 7.756636619567871,
1134
- "learning_rate": 0.00011457349851700407,
1135
- "loss": 0.619,
1136
  "step": 16100
1137
  },
1138
  {
1139
- "epoch": 0.47303413437673375,
1140
- "grad_norm": 7.793608665466309,
1141
- "learning_rate": 0.00011363723596918397,
1142
- "loss": 0.634,
1143
  "step": 16200
1144
  },
1145
  {
1146
- "epoch": 0.4759540981691827,
1147
- "grad_norm": 3.12341046333313,
1148
- "learning_rate": 0.00011269975373054557,
1149
- "loss": 0.5714,
1150
  "step": 16300
1151
  },
1152
  {
1153
- "epoch": 0.4788740619616317,
1154
- "grad_norm": 6.63306999206543,
1155
- "learning_rate": 0.00011176113564788022,
1156
- "loss": 0.5377,
1157
  "step": 16400
1158
  },
1159
  {
1160
- "epoch": 0.48179402575408065,
1161
- "grad_norm": 3.5353915691375732,
1162
- "learning_rate": 0.0001108214656695672,
1163
- "loss": 0.5573,
1164
  "step": 16500
1165
  },
1166
  {
1167
- "epoch": 0.4847139895465296,
1168
- "grad_norm": 5.9525933265686035,
1169
- "learning_rate": 0.00010988082783806548,
1170
- "loss": 0.5805,
1171
  "step": 16600
1172
  },
1173
  {
1174
- "epoch": 0.4876339533389786,
1175
- "grad_norm": 5.2525410652160645,
1176
- "learning_rate": 0.00010893930628239719,
1177
- "loss": 0.5826,
1178
  "step": 16700
1179
  },
1180
  {
1181
- "epoch": 0.4905539171314276,
1182
- "grad_norm": 6.823146343231201,
1183
- "learning_rate": 0.00010799698521062318,
1184
- "loss": 0.5534,
1185
  "step": 16800
1186
  },
1187
  {
1188
- "epoch": 0.49347388092387656,
1189
- "grad_norm": 4.929098606109619,
1190
- "learning_rate": 0.00010705394890231171,
1191
- "loss": 0.5175,
1192
  "step": 16900
1193
  },
1194
  {
1195
- "epoch": 0.4963938447163255,
1196
- "grad_norm": 3.4094526767730713,
1197
- "learning_rate": 0.0001061102817010005,
1198
- "loss": 0.5499,
1199
  "step": 17000
1200
  },
1201
  {
1202
- "epoch": 0.4993138085087745,
1203
- "grad_norm": 3.1009860038757324,
1204
- "learning_rate": 0.00010516606800665324,
1205
- "loss": 0.5971,
1206
  "step": 17100
1207
- },
1208
- {
1209
- "epoch": 0.5022337723012235,
1210
- "grad_norm": 1.6674872636795044,
1211
- "learning_rate": 0.00010422139226811105,
1212
- "loss": 0.5571,
1213
- "step": 17200
1214
- },
1215
- {
1216
- "epoch": 0.5051537360936724,
1217
- "grad_norm": 5.221986293792725,
1218
- "learning_rate": 0.0001032763389755395,
1219
- "loss": 0.6719,
1220
- "step": 17300
1221
- },
1222
- {
1223
- "epoch": 0.5080736998861214,
1224
- "grad_norm": 5.639619827270508,
1225
- "learning_rate": 0.00010233099265287182,
1226
- "loss": 0.5489,
1227
- "step": 17400
1228
- },
1229
- {
1230
- "epoch": 0.5109936636785704,
1231
- "grad_norm": 5.451557636260986,
1232
- "learning_rate": 0.0001013854378502496,
1233
- "loss": 0.6173,
1234
- "step": 17500
1235
- },
1236
- {
1237
- "epoch": 0.5139136274710193,
1238
- "grad_norm": 5.069969177246094,
1239
- "learning_rate": 0.00010043975913646017,
1240
- "loss": 0.5701,
1241
- "step": 17600
1242
- },
1243
- {
1244
- "epoch": 0.5168335912634683,
1245
- "grad_norm": 1.3468672037124634,
1246
- "learning_rate": 9.949404109137361e-05,
1247
- "loss": 0.5582,
1248
- "step": 17700
1249
- },
1250
- {
1251
- "epoch": 0.5197535550559174,
1252
- "grad_norm": 7.0001420974731445,
1253
- "learning_rate": 9.854836829837741e-05,
1254
- "loss": 0.6168,
1255
- "step": 17800
1256
- },
1257
- {
1258
- "epoch": 0.5226735188483663,
1259
- "grad_norm": 2.2107551097869873,
1260
- "learning_rate": 9.760282533681206e-05,
1261
- "loss": 0.4888,
1262
- "step": 17900
1263
- },
1264
- {
1265
- "epoch": 0.5255934826408153,
1266
- "grad_norm": 5.56157922744751,
1267
- "learning_rate": 9.665749677440597e-05,
1268
- "loss": 0.5833,
1269
- "step": 18000
1270
- },
1271
- {
1272
- "epoch": 0.5285134464332643,
1273
- "grad_norm": 3.0002171993255615,
1274
- "learning_rate": 9.571246715971211e-05,
1275
- "loss": 0.5705,
1276
- "step": 18100
1277
- },
1278
- {
1279
- "epoch": 0.5314334102257132,
1280
- "grad_norm": 3.6839287281036377,
1281
- "learning_rate": 9.476782101454611e-05,
1282
- "loss": 0.5261,
1283
- "step": 18200
1284
- },
1285
- {
1286
- "epoch": 0.5343533740181622,
1287
- "grad_norm": 6.553487777709961,
1288
- "learning_rate": 9.382364282642675e-05,
1289
- "loss": 0.5936,
1290
- "step": 18300
1291
- },
1292
- {
1293
- "epoch": 0.5372733378106112,
1294
- "grad_norm": 6.64651346206665,
1295
- "learning_rate": 9.28800170410195e-05,
1296
- "loss": 0.5576,
1297
- "step": 18400
1298
- },
1299
- {
1300
- "epoch": 0.5401933016030601,
1301
- "grad_norm": 4.049524307250977,
1302
- "learning_rate": 9.193702805458388e-05,
1303
- "loss": 0.5326,
1304
- "step": 18500
1305
- },
1306
- {
1307
- "epoch": 0.5431132653955091,
1308
- "grad_norm": 3.995262861251831,
1309
- "learning_rate": 9.09947602064253e-05,
1310
- "loss": 0.577,
1311
- "step": 18600
1312
- },
1313
- {
1314
- "epoch": 0.546033229187958,
1315
- "grad_norm": 5.921045303344727,
1316
- "learning_rate": 9.005329777135172e-05,
1317
- "loss": 0.5901,
1318
- "step": 18700
1319
- },
1320
- {
1321
- "epoch": 0.548953192980407,
1322
- "grad_norm": 1.7855268716812134,
1323
- "learning_rate": 8.911272495213639e-05,
1324
- "loss": 0.5026,
1325
- "step": 18800
1326
- },
1327
- {
1328
- "epoch": 0.551873156772856,
1329
- "grad_norm": 2.2023262977600098,
1330
- "learning_rate": 8.818251676677135e-05,
1331
- "loss": 0.526,
1332
- "step": 18900
1333
- },
1334
- {
1335
- "epoch": 0.5547931205653049,
1336
- "grad_norm": 2.3260550498962402,
1337
- "learning_rate": 8.724396446837928e-05,
1338
- "loss": 0.5715,
1339
- "step": 19000
1340
- },
1341
- {
1342
- "epoch": 0.557713084357754,
1343
- "grad_norm": 4.143796920776367,
1344
- "learning_rate": 8.63065530477612e-05,
1345
- "loss": 0.5346,
1346
- "step": 19100
1347
- },
1348
- {
1349
- "epoch": 0.560633048150203,
1350
- "grad_norm": 3.3329811096191406,
1351
- "learning_rate": 8.537036634537457e-05,
1352
- "loss": 0.525,
1353
- "step": 19200
1354
- },
1355
- {
1356
- "epoch": 0.563553011942652,
1357
- "grad_norm": 2.1307923793792725,
1358
- "learning_rate": 8.443548809214026e-05,
1359
- "loss": 0.6003,
1360
- "step": 19300
1361
- },
1362
- {
1363
- "epoch": 0.5664729757351009,
1364
- "grad_norm": 9.149153709411621,
1365
- "learning_rate": 8.350200190195363e-05,
1366
- "loss": 0.5832,
1367
- "step": 19400
1368
- },
1369
- {
1370
- "epoch": 0.5693929395275499,
1371
- "grad_norm": 2.979999303817749,
1372
- "learning_rate": 8.25699912642064e-05,
1373
- "loss": 0.5697,
1374
- "step": 19500
1375
- },
1376
- {
1377
- "epoch": 0.5723129033199988,
1378
- "grad_norm": 3.4557132720947266,
1379
- "learning_rate": 8.163953953631926e-05,
1380
- "loss": 0.5883,
1381
- "step": 19600
1382
- },
1383
- {
1384
- "epoch": 0.5752328671124478,
1385
- "grad_norm": 5.705073833465576,
1386
- "learning_rate": 8.071072993628694e-05,
1387
- "loss": 0.5408,
1388
- "step": 19700
1389
- },
1390
- {
1391
- "epoch": 0.5781528309048968,
1392
- "grad_norm": 4.233023166656494,
1393
- "learning_rate": 7.978364553523492e-05,
1394
- "loss": 0.564,
1395
- "step": 19800
1396
- },
1397
- {
1398
- "epoch": 0.5810727946973457,
1399
- "grad_norm": 4.051882266998291,
1400
- "learning_rate": 7.885836924998998e-05,
1401
- "loss": 0.5367,
1402
- "step": 19900
1403
- },
1404
- {
1405
- "epoch": 0.5839927584897947,
1406
- "grad_norm": 6.308223724365234,
1407
- "learning_rate": 7.793498383566398e-05,
1408
- "loss": 0.5494,
1409
- "step": 20000
1410
- },
1411
- {
1412
- "epoch": 0.5869127222822437,
1413
- "grad_norm": 3.083712577819824,
1414
- "learning_rate": 7.701357187825272e-05,
1415
- "loss": 0.5712,
1416
- "step": 20100
1417
- },
1418
- {
1419
- "epoch": 0.5898326860746926,
1420
- "grad_norm": 4.624127388000488,
1421
- "learning_rate": 7.609421578724932e-05,
1422
- "loss": 0.6324,
1423
- "step": 20200
1424
- },
1425
- {
1426
- "epoch": 0.5927526498671416,
1427
- "grad_norm": 2.8166234493255615,
1428
- "learning_rate": 7.517699778827388e-05,
1429
- "loss": 0.5447,
1430
- "step": 20300
1431
- },
1432
- {
1433
- "epoch": 0.5956726136595906,
1434
- "grad_norm": 5.028948783874512,
1435
- "learning_rate": 7.42619999157191e-05,
1436
- "loss": 0.5499,
1437
- "step": 20400
1438
- },
1439
- {
1440
- "epoch": 0.5985925774520396,
1441
- "grad_norm": 2.2335968017578125,
1442
- "learning_rate": 7.334930400541347e-05,
1443
- "loss": 0.5178,
1444
- "step": 20500
1445
- },
1446
- {
1447
- "epoch": 0.6015125412444886,
1448
- "grad_norm": 2.491394281387329,
1449
- "learning_rate": 7.243899168730202e-05,
1450
- "loss": 0.5571,
1451
- "step": 20600
1452
- },
1453
- {
1454
- "epoch": 0.6044325050369376,
1455
- "grad_norm": 5.571224212646484,
1456
- "learning_rate": 7.153114437814525e-05,
1457
- "loss": 0.5365,
1458
- "step": 20700
1459
- },
1460
- {
1461
- "epoch": 0.6073524688293865,
1462
- "grad_norm": 3.0990610122680664,
1463
- "learning_rate": 7.062584327423772e-05,
1464
- "loss": 0.505,
1465
- "step": 20800
1466
- },
1467
- {
1468
- "epoch": 0.6102724326218355,
1469
- "grad_norm": 2.507359266281128,
1470
- "learning_rate": 6.972316934414562e-05,
1471
- "loss": 0.5801,
1472
- "step": 20900
1473
- },
1474
- {
1475
- "epoch": 0.6131923964142845,
1476
- "grad_norm": 2.0391249656677246,
1477
- "learning_rate": 6.882320332146542e-05,
1478
- "loss": 0.4986,
1479
- "step": 21000
1480
- },
1481
- {
1482
- "epoch": 0.6161123602067334,
1483
- "grad_norm": 6.61077356338501,
1484
- "learning_rate": 6.793498340758233e-05,
1485
- "loss": 0.5985,
1486
- "step": 21100
1487
- },
1488
- {
1489
- "epoch": 0.6190323239991824,
1490
- "grad_norm": 2.313990831375122,
1491
- "learning_rate": 6.704064534178382e-05,
1492
- "loss": 0.5378,
1493
- "step": 21200
1494
- },
1495
- {
1496
- "epoch": 0.6219522877916314,
1497
- "grad_norm": 3.221607208251953,
1498
- "learning_rate": 6.614925510370827e-05,
1499
- "loss": 0.5767,
1500
- "step": 21300
1501
- },
1502
- {
1503
- "epoch": 0.6248722515840803,
1504
- "grad_norm": 2.1119728088378906,
1505
- "learning_rate": 6.526089241775799e-05,
1506
- "loss": 0.5344,
1507
- "step": 21400
1508
- },
1509
- {
1510
- "epoch": 0.6277922153765293,
1511
- "grad_norm": 6.011439323425293,
1512
- "learning_rate": 6.437563673755653e-05,
1513
- "loss": 0.6197,
1514
- "step": 21500
1515
- },
1516
- {
1517
- "epoch": 0.6307121791689783,
1518
- "grad_norm": 1.3864915370941162,
1519
- "learning_rate": 6.349356723884206e-05,
1520
- "loss": 0.5489,
1521
- "step": 21600
1522
- },
1523
- {
1524
- "epoch": 0.6336321429614272,
1525
- "grad_norm": 2.4754810333251953,
1526
- "learning_rate": 6.261476281238625e-05,
1527
- "loss": 0.4908,
1528
- "step": 21700
1529
- },
1530
- {
1531
- "epoch": 0.6365521067538763,
1532
- "grad_norm": 3.065643787384033,
1533
- "learning_rate": 6.173930205693829e-05,
1534
- "loss": 0.5829,
1535
- "step": 21800
1536
- },
1537
- {
1538
- "epoch": 0.6394720705463253,
1539
- "grad_norm": 4.440421104431152,
1540
- "learning_rate": 6.086726327219525e-05,
1541
- "loss": 0.5175,
1542
- "step": 21900
1543
- },
1544
- {
1545
- "epoch": 0.6423920343387742,
1546
- "grad_norm": 2.5433878898620605,
1547
- "learning_rate": 5.999872445179904e-05,
1548
- "loss": 0.604,
1549
- "step": 22000
1550
- },
1551
- {
1552
- "epoch": 0.6453119981312232,
1553
- "grad_norm": 5.749415397644043,
1554
- "learning_rate": 5.9133763276360855e-05,
1555
- "loss": 0.5274,
1556
- "step": 22100
1557
- },
1558
- {
1559
- "epoch": 0.6482319619236722,
1560
- "grad_norm": 4.130051136016846,
1561
- "learning_rate": 5.8272457106513435e-05,
1562
- "loss": 0.524,
1563
- "step": 22200
1564
- },
1565
- {
1566
- "epoch": 0.6511519257161211,
1567
- "grad_norm": 4.023646831512451,
1568
- "learning_rate": 5.741488297599216e-05,
1569
- "loss": 0.5641,
1570
- "step": 22300
1571
- },
1572
- {
1573
- "epoch": 0.6540718895085701,
1574
- "grad_norm": 3.071547746658325,
1575
- "learning_rate": 5.656111758474528e-05,
1576
- "loss": 0.5693,
1577
- "step": 22400
1578
- },
1579
- {
1580
- "epoch": 0.6569918533010191,
1581
- "grad_norm": 2.316500663757324,
1582
- "learning_rate": 5.571123729207408e-05,
1583
- "loss": 0.4671,
1584
- "step": 22500
1585
- },
1586
- {
1587
- "epoch": 0.659911817093468,
1588
- "grad_norm": 2.743230104446411,
1589
- "learning_rate": 5.4865318109803064e-05,
1590
- "loss": 0.5422,
1591
- "step": 22600
1592
- },
1593
- {
1594
- "epoch": 0.662831780885917,
1595
- "grad_norm": 4.95118522644043,
1596
- "learning_rate": 5.402343569548214e-05,
1597
- "loss": 0.5305,
1598
- "step": 22700
1599
- },
1600
- {
1601
- "epoch": 0.665751744678366,
1602
- "grad_norm": 5.621161460876465,
1603
- "learning_rate": 5.318566534561956e-05,
1604
- "loss": 0.5706,
1605
- "step": 22800
1606
- },
1607
- {
1608
- "epoch": 0.6686717084708149,
1609
- "grad_norm": 6.144950866699219,
1610
- "learning_rate": 5.236039685178543e-05,
1611
- "loss": 0.5763,
1612
- "step": 22900
1613
- },
1614
- {
1615
- "epoch": 0.6715916722632639,
1616
- "grad_norm": 2.8691632747650146,
1617
- "learning_rate": 5.1531032059287074e-05,
1618
- "loss": 0.5099,
1619
- "step": 23000
1620
- },
1621
- {
1622
- "epoch": 0.674511636055713,
1623
- "grad_norm": 1.8906828165054321,
1624
- "learning_rate": 5.070600224752108e-05,
1625
- "loss": 0.5564,
1626
- "step": 23100
1627
- },
1628
- {
1629
- "epoch": 0.6774315998481619,
1630
- "grad_norm": 2.1284193992614746,
1631
- "learning_rate": 4.9885381205728124e-05,
1632
- "loss": 0.5256,
1633
- "step": 23200
1634
- },
1635
- {
1636
- "epoch": 0.6803515636406109,
1637
- "grad_norm": 8.401644706726074,
1638
- "learning_rate": 4.9069242328835974e-05,
1639
- "loss": 0.5609,
1640
- "step": 23300
1641
- },
1642
- {
1643
- "epoch": 0.6832715274330599,
1644
- "grad_norm": 3.2185475826263428,
1645
- "learning_rate": 4.8257658610895485e-05,
1646
- "loss": 0.5466,
1647
- "step": 23400
1648
- },
1649
- {
1650
- "epoch": 0.6861914912255088,
1651
- "grad_norm": 1.7117905616760254,
1652
- "learning_rate": 4.7450702638551795e-05,
1653
- "loss": 0.532,
1654
- "step": 23500
1655
- },
1656
- {
1657
- "epoch": 0.6891114550179578,
1658
- "grad_norm": 5.279407501220703,
1659
- "learning_rate": 4.664844658455262e-05,
1660
- "loss": 0.4883,
1661
- "step": 23600
1662
- },
1663
- {
1664
- "epoch": 0.6920314188104068,
1665
- "grad_norm": 3.187605619430542,
1666
- "learning_rate": 4.5850962201292956e-05,
1667
- "loss": 0.5377,
1668
- "step": 23700
1669
- },
1670
- {
1671
- "epoch": 0.6949513826028557,
1672
- "grad_norm": 3.679347515106201,
1673
- "learning_rate": 4.5058320814397984e-05,
1674
- "loss": 0.5999,
1675
- "step": 23800
1676
- },
1677
- {
1678
- "epoch": 0.6978713463953047,
1679
- "grad_norm": 2.252751588821411,
1680
- "learning_rate": 4.427059331634348e-05,
1681
- "loss": 0.479,
1682
- "step": 23900
1683
- },
1684
- {
1685
- "epoch": 0.7007913101877536,
1686
- "grad_norm": 4.126784801483154,
1687
- "learning_rate": 4.348785016011566e-05,
1688
- "loss": 0.5708,
1689
- "step": 24000
1690
- },
1691
- {
1692
- "epoch": 0.7037112739802026,
1693
- "grad_norm": 2.982395648956299,
1694
- "learning_rate": 4.2710161352909674e-05,
1695
- "loss": 0.4497,
1696
- "step": 24100
1697
- },
1698
- {
1699
- "epoch": 0.7066312377726516,
1700
- "grad_norm": 2.803725242614746,
1701
- "learning_rate": 4.193759644986852e-05,
1702
- "loss": 0.526,
1703
- "step": 24200
1704
- },
1705
- {
1706
- "epoch": 0.7095512015651005,
1707
- "grad_norm": 2.7953624725341797,
1708
- "learning_rate": 4.117022454786209e-05,
1709
- "loss": 0.5743,
1710
- "step": 24300
1711
- },
1712
- {
1713
- "epoch": 0.7124711653575496,
1714
- "grad_norm": 3.2696502208709717,
1715
- "learning_rate": 4.040811427930715e-05,
1716
- "loss": 0.531,
1717
- "step": 24400
1718
- },
1719
- {
1720
- "epoch": 0.7153911291499986,
1721
- "grad_norm": 5.39483118057251,
1722
- "learning_rate": 3.9651333806029066e-05,
1723
- "loss": 0.6086,
1724
- "step": 24500
1725
- },
1726
- {
1727
- "epoch": 0.7183110929424475,
1728
- "grad_norm": 2.236680030822754,
1729
- "learning_rate": 3.8899950813165595e-05,
1730
- "loss": 0.538,
1731
- "step": 24600
1732
- },
1733
- {
1734
- "epoch": 0.7212310567348965,
1735
- "grad_norm": 5.033971786499023,
1736
- "learning_rate": 3.815403250311317e-05,
1737
- "loss": 0.5001,
1738
- "step": 24700
1739
- },
1740
- {
1741
- "epoch": 0.7241510205273455,
1742
- "grad_norm": 3.3246283531188965,
1743
- "learning_rate": 3.741364558951638e-05,
1744
- "loss": 0.5278,
1745
- "step": 24800
1746
- },
1747
- {
1748
- "epoch": 0.7270709843197944,
1749
- "grad_norm": 1.9075288772583008,
1750
- "learning_rate": 3.667885629130124e-05,
1751
- "loss": 0.5903,
1752
- "step": 24900
1753
- },
1754
- {
1755
- "epoch": 0.7299909481122434,
1756
- "grad_norm": 3.299224853515625,
1757
- "learning_rate": 3.5949730326752854e-05,
1758
- "loss": 0.4959,
1759
- "step": 25000
1760
- },
1761
- {
1762
- "epoch": 0.7329109119046924,
1763
- "grad_norm": 3.502321720123291,
1764
- "learning_rate": 3.522633290763746e-05,
1765
- "loss": 0.5437,
1766
- "step": 25100
1767
- },
1768
- {
1769
- "epoch": 0.7358308756971413,
1770
- "grad_norm": 6.021385192871094,
1771
- "learning_rate": 3.45087287333701e-05,
1772
- "loss": 0.5548,
1773
- "step": 25200
1774
- },
1775
- {
1776
- "epoch": 0.7387508394895903,
1777
- "grad_norm": 2.9654223918914795,
1778
- "learning_rate": 3.379698198522798e-05,
1779
- "loss": 0.5051,
1780
- "step": 25300
1781
- },
1782
- {
1783
- "epoch": 0.7416708032820393,
1784
- "grad_norm": 5.164859294891357,
1785
- "learning_rate": 3.30911563206103e-05,
1786
- "loss": 0.5165,
1787
- "step": 25400
1788
- },
1789
- {
1790
- "epoch": 0.7445907670744882,
1791
- "grad_norm": 3.9957275390625,
1792
- "learning_rate": 3.239131486734478e-05,
1793
- "loss": 0.4851,
1794
- "step": 25500
1795
- },
1796
- {
1797
- "epoch": 0.7475107308669372,
1798
- "grad_norm": 4.626624584197998,
1799
- "learning_rate": 3.1697520218041555e-05,
1800
- "loss": 0.5368,
1801
- "step": 25600
1802
- },
1803
- {
1804
- "epoch": 0.7504306946593863,
1805
- "grad_norm": 2.424008846282959,
1806
- "learning_rate": 3.100983442449506e-05,
1807
- "loss": 0.4919,
1808
- "step": 25700
1809
- },
1810
- {
1811
- "epoch": 0.7533506584518352,
1812
- "grad_norm": 2.843548536300659,
1813
- "learning_rate": 3.032831899213433e-05,
1814
- "loss": 0.5479,
1815
- "step": 25800
1816
- },
1817
- {
1818
- "epoch": 0.7562706222442842,
1819
- "grad_norm": 2.585984945297241,
1820
- "learning_rate": 2.965303487452189e-05,
1821
- "loss": 0.4914,
1822
- "step": 25900
1823
- },
1824
- {
1825
- "epoch": 0.7591905860367332,
1826
- "grad_norm": 2.7971699237823486,
1827
- "learning_rate": 2.8984042467902207e-05,
1828
- "loss": 0.5433,
1829
- "step": 26000
1830
- },
1831
- {
1832
- "epoch": 0.7621105498291821,
1833
- "grad_norm": 2.5568065643310547,
1834
- "learning_rate": 2.832140160580007e-05,
1835
- "loss": 0.5143,
1836
- "step": 26100
1837
- },
1838
- {
1839
- "epoch": 0.7650305136216311,
1840
- "grad_norm": 2.1987948417663574,
1841
- "learning_rate": 2.7665171553669023e-05,
1842
- "loss": 0.47,
1843
- "step": 26200
1844
- },
1845
- {
1846
- "epoch": 0.7679504774140801,
1847
- "grad_norm": 2.724154233932495,
1848
- "learning_rate": 2.701541100359093e-05,
1849
- "loss": 0.4984,
1850
- "step": 26300
1851
- },
1852
- {
1853
- "epoch": 0.770870441206529,
1854
- "grad_norm": 3.9086155891418457,
1855
- "learning_rate": 2.6372178069026464e-05,
1856
- "loss": 0.5805,
1857
- "step": 26400
1858
- },
1859
- {
1860
- "epoch": 0.773790404998978,
1861
- "grad_norm": 3.4147980213165283,
1862
- "learning_rate": 2.573553027961778e-05,
1863
- "loss": 0.4967,
1864
- "step": 26500
1865
- },
1866
- {
1867
- "epoch": 0.776710368791427,
1868
- "grad_norm": 6.612050533294678,
1869
- "learning_rate": 2.5105524576042894e-05,
1870
- "loss": 0.5695,
1871
- "step": 26600
1872
- },
1873
- {
1874
- "epoch": 0.7796303325838759,
1875
- "grad_norm": 3.5038676261901855,
1876
- "learning_rate": 2.4482217304923273e-05,
1877
- "loss": 0.5575,
1878
- "step": 26700
1879
- },
1880
- {
1881
- "epoch": 0.7825502963763249,
1882
- "grad_norm": 1.891401767730713,
1883
- "learning_rate": 2.3865664213784024e-05,
1884
- "loss": 0.5505,
1885
- "step": 26800
1886
- },
1887
- {
1888
- "epoch": 0.7854702601687739,
1889
- "grad_norm": 2.7758102416992188,
1890
- "learning_rate": 2.32559204460682e-05,
1891
- "loss": 0.548,
1892
- "step": 26900
1893
- },
1894
- {
1895
- "epoch": 0.7883902239612229,
1896
- "grad_norm": 8.392036437988281,
1897
- "learning_rate": 2.2653040536204616e-05,
1898
- "loss": 0.5254,
1899
- "step": 27000
1900
- },
1901
- {
1902
- "epoch": 0.7913101877536719,
1903
- "grad_norm": 1.787404179573059,
1904
- "learning_rate": 2.2057078404730637e-05,
1905
- "loss": 0.5049,
1906
- "step": 27100
1907
- },
1908
- {
1909
- "epoch": 0.7942301515461209,
1910
- "grad_norm": 4.141063213348389,
1911
- "learning_rate": 2.1468087353469336e-05,
1912
- "loss": 0.5384,
1913
- "step": 27200
1914
- },
1915
- {
1916
- "epoch": 0.7971501153385698,
1917
- "grad_norm": 4.328978538513184,
1918
- "learning_rate": 2.0886120060762516e-05,
1919
- "loss": 0.5508,
1920
- "step": 27300
1921
- },
1922
- {
1923
- "epoch": 0.8000700791310188,
1924
- "grad_norm": 2.4056620597839355,
1925
- "learning_rate": 2.0311228576759157e-05,
1926
- "loss": 0.5034,
1927
- "step": 27400
1928
- },
1929
- {
1930
- "epoch": 0.8029900429234678,
1931
- "grad_norm": 2.209656000137329,
1932
- "learning_rate": 1.9743464318760087e-05,
1933
- "loss": 0.585,
1934
- "step": 27500
1935
- },
1936
- {
1937
- "epoch": 0.8059100067159167,
1938
- "grad_norm": 3.6636762619018555,
1939
- "learning_rate": 1.9182878066619337e-05,
1940
- "loss": 0.5427,
1941
- "step": 27600
1942
- },
1943
- {
1944
- "epoch": 0.8088299705083657,
1945
- "grad_norm": 3.2900936603546143,
1946
- "learning_rate": 1.862951995820257e-05,
1947
- "loss": 0.546,
1948
- "step": 27700
1949
- },
1950
- {
1951
- "epoch": 0.8117499343008147,
1952
- "grad_norm": 1.6237975358963013,
1953
- "learning_rate": 1.8083439484902754e-05,
1954
- "loss": 0.51,
1955
- "step": 27800
1956
- },
1957
- {
1958
- "epoch": 0.8146698980932636,
1959
- "grad_norm": 4.96632719039917,
1960
- "learning_rate": 1.7544685487213718e-05,
1961
- "loss": 0.5784,
1962
- "step": 27900
1963
- },
1964
- {
1965
- "epoch": 0.8175898618857126,
1966
- "grad_norm": 4.221993446350098,
1967
- "learning_rate": 1.701330615036194e-05,
1968
- "loss": 0.5209,
1969
- "step": 28000
1970
- },
1971
- {
1972
- "epoch": 0.8205098256781616,
1973
- "grad_norm": 3.626847982406616,
1974
- "learning_rate": 1.6489348999997068e-05,
1975
- "loss": 0.4913,
1976
- "step": 28100
1977
- },
1978
- {
1979
- "epoch": 0.8234297894706105,
1980
- "grad_norm": 4.361175537109375,
1981
- "learning_rate": 1.5972860897941167e-05,
1982
- "loss": 0.5247,
1983
- "step": 28200
1984
- },
1985
- {
1986
- "epoch": 0.8263497532630595,
1987
- "grad_norm": 4.415472507476807,
1988
- "learning_rate": 1.5463888037997522e-05,
1989
- "loss": 0.5272,
1990
- "step": 28300
1991
- },
1992
- {
1993
- "epoch": 0.8292697170555086,
1994
- "grad_norm": 3.3726589679718018,
1995
- "learning_rate": 1.4962475941819132e-05,
1996
- "loss": 0.474,
1997
- "step": 28400
1998
- },
1999
- {
2000
- "epoch": 0.8321896808479575,
2001
- "grad_norm": 1.9652336835861206,
2002
- "learning_rate": 1.4468669454837402e-05,
2003
- "loss": 0.5134,
2004
- "step": 28500
2005
- },
2006
- {
2007
- "epoch": 0.8351096446404065,
2008
- "grad_norm": 2.4251391887664795,
2009
- "learning_rate": 1.3982512742251197e-05,
2010
- "loss": 0.5433,
2011
- "step": 28600
2012
- },
2013
- {
2014
- "epoch": 0.8380296084328555,
2015
- "grad_norm": 1.2398502826690674,
2016
- "learning_rate": 1.3504049285076725e-05,
2017
- "loss": 0.5545,
2018
- "step": 28700
2019
- },
2020
- {
2021
- "epoch": 0.8409495722253044,
2022
- "grad_norm": 3.1720731258392334,
2023
- "learning_rate": 1.3033321876258742e-05,
2024
- "loss": 0.5603,
2025
- "step": 28800
2026
- },
2027
- {
2028
- "epoch": 0.8438695360177534,
2029
- "grad_norm": 2.1568796634674072,
2030
- "learning_rate": 1.2570372616843229e-05,
2031
- "loss": 0.472,
2032
- "step": 28900
2033
- },
2034
- {
2035
- "epoch": 0.8467894998102024,
2036
- "grad_norm": 2.8056724071502686,
2037
- "learning_rate": 1.211975536822222e-05,
2038
- "loss": 0.5052,
2039
- "step": 29000
2040
- },
2041
- {
2042
- "epoch": 0.8497094636026513,
2043
- "grad_norm": 1.3424701690673828,
2044
- "learning_rate": 1.1672407122591966e-05,
2045
- "loss": 0.4589,
2046
- "step": 29100
2047
- },
2048
- {
2049
- "epoch": 0.8526294273951003,
2050
- "grad_norm": 2.5995755195617676,
2051
- "learning_rate": 1.123295874422794e-05,
2052
- "loss": 0.5132,
2053
- "step": 29200
2054
- },
2055
- {
2056
- "epoch": 0.8555493911875492,
2057
- "grad_norm": 3.993947744369507,
2058
- "learning_rate": 1.0801449536633657e-05,
2059
- "loss": 0.4967,
2060
- "step": 29300
2061
- },
2062
- {
2063
- "epoch": 0.8584693549799982,
2064
- "grad_norm": 1.6350364685058594,
2065
- "learning_rate": 1.0377918093246986e-05,
2066
- "loss": 0.4799,
2067
- "step": 29400
2068
- },
2069
- {
2070
- "epoch": 0.8613893187724472,
2071
- "grad_norm": 3.297436237335205,
2072
- "learning_rate": 9.96240229398826e-06,
2073
- "loss": 0.4838,
2074
- "step": 29500
2075
- },
2076
- {
2077
- "epoch": 0.8643092825648961,
2078
- "grad_norm": 2.4876105785369873,
2079
- "learning_rate": 9.554939301872545e-06,
2080
- "loss": 0.4792,
2081
- "step": 29600
2082
- },
2083
- {
2084
- "epoch": 0.8672292463573452,
2085
- "grad_norm": 3.8208444118499756,
2086
- "learning_rate": 9.155565559685697e-06,
2087
- "loss": 0.5394,
2088
- "step": 29700
2089
- },
2090
- {
2091
- "epoch": 0.8701492101497942,
2092
- "grad_norm": 1.8198366165161133,
2093
- "learning_rate": 8.764316786725013e-06,
2094
- "loss": 0.4818,
2095
- "step": 29800
2096
- },
2097
- {
2098
- "epoch": 0.8730691739422431,
2099
- "grad_norm": 4.755416393280029,
2100
- "learning_rate": 8.381227975604677e-06,
2101
- "loss": 0.4975,
2102
- "step": 29900
2103
- },
2104
- {
2105
- "epoch": 0.8759891377346921,
2106
- "grad_norm": 2.5053181648254395,
2107
- "learning_rate": 8.006333389125908e-06,
2108
- "loss": 0.4599,
2109
- "step": 30000
2110
- },
2111
- {
2112
- "epoch": 0.8789091015271411,
2113
- "grad_norm": NaN,
2114
- "learning_rate": 7.643292389855882e-06,
2115
- "loss": 0.5384,
2116
- "step": 30100
2117
- },
2118
- {
2119
- "epoch": 0.88182906531959,
2120
- "grad_norm": 5.616009712219238,
2121
- "learning_rate": 7.28480334116276e-06,
2122
- "loss": 0.5446,
2123
- "step": 30200
2124
- },
2125
- {
2126
- "epoch": 0.884749029112039,
2127
- "grad_norm": 2.7977347373962402,
2128
- "learning_rate": 6.934606579435754e-06,
2129
- "loss": 0.5179,
2130
- "step": 30300
2131
- },
2132
- {
2133
- "epoch": 0.887668992904488,
2134
- "grad_norm": 2.461843490600586,
2135
- "learning_rate": 6.5927334256680735e-06,
2136
- "loss": 0.5078,
2137
- "step": 30400
2138
- },
2139
- {
2140
- "epoch": 0.8905889566969369,
2141
- "grad_norm": 5.445867538452148,
2142
- "learning_rate": 6.259214456403772e-06,
2143
- "loss": 0.6466,
2144
- "step": 30500
2145
- },
2146
- {
2147
- "epoch": 0.8935089204893859,
2148
- "grad_norm": 2.652655601501465,
2149
- "learning_rate": 5.934079501003087e-06,
2150
- "loss": 0.5021,
2151
- "step": 30600
2152
- },
2153
- {
2154
- "epoch": 0.8964288842818349,
2155
- "grad_norm": 2.0053634643554688,
2156
- "learning_rate": 5.617357638974574e-06,
2157
- "loss": 0.4928,
2158
- "step": 30700
2159
- },
2160
- {
2161
- "epoch": 0.8993488480742838,
2162
- "grad_norm": 2.91902232170105,
2163
- "learning_rate": 5.309077197374246e-06,
2164
- "loss": 0.5492,
2165
- "step": 30800
2166
- },
2167
- {
2168
- "epoch": 0.9022688118667328,
2169
- "grad_norm": 2.4080049991607666,
2170
- "learning_rate": 5.009265748272041e-06,
2171
- "loss": 0.4833,
2172
- "step": 30900
2173
- },
2174
- {
2175
- "epoch": 0.9051887756591819,
2176
- "grad_norm": 2.9708127975463867,
2177
- "learning_rate": 4.717950106285862e-06,
2178
- "loss": 0.5559,
2179
- "step": 31000
2180
- },
2181
- {
2182
- "epoch": 0.9081087394516308,
2183
- "grad_norm": 1.8519601821899414,
2184
- "learning_rate": 4.4351563261832984e-06,
2185
- "loss": 0.5401,
2186
- "step": 31100
2187
- },
2188
- {
2189
- "epoch": 0.9110287032440798,
2190
- "grad_norm": 4.341070652008057,
2191
- "learning_rate": 4.160909700551341e-06,
2192
- "loss": 0.4997,
2193
- "step": 31200
2194
- },
2195
- {
2196
- "epoch": 0.9139486670365288,
2197
- "grad_norm": 2.3663878440856934,
2198
- "learning_rate": 3.895234757534249e-06,
2199
- "loss": 0.4632,
2200
- "step": 31300
2201
- },
2202
- {
2203
- "epoch": 0.9168686308289777,
2204
- "grad_norm": 7.448829174041748,
2205
- "learning_rate": 3.6381552586398193e-06,
2206
- "loss": 0.4853,
2207
- "step": 31400
2208
- },
2209
- {
2210
- "epoch": 0.9197885946214267,
2211
- "grad_norm": 5.03635311126709,
2212
- "learning_rate": 3.392136072366148e-06,
2213
- "loss": 0.57,
2214
- "step": 31500
2215
- },
2216
- {
2217
- "epoch": 0.9227085584138757,
2218
- "grad_norm": 1.9699945449829102,
2219
- "learning_rate": 3.1522291550866503e-06,
2220
- "loss": 0.5089,
2221
- "step": 31600
2222
- },
2223
- {
2224
- "epoch": 0.9256285222063246,
2225
- "grad_norm": 2.2074334621429443,
2226
- "learning_rate": 2.9209841350668955e-06,
2227
- "loss": 0.5101,
2228
- "step": 31700
2229
- },
2230
- {
2231
- "epoch": 0.9285484859987736,
2232
- "grad_norm": 4.090612888336182,
2233
- "learning_rate": 2.6984216944618502e-06,
2234
- "loss": 0.5134,
2235
- "step": 31800
2236
- },
2237
- {
2238
- "epoch": 0.9314684497912226,
2239
- "grad_norm": 4.454416751861572,
2240
- "learning_rate": 2.4845617388715823e-06,
2241
- "loss": 0.4935,
2242
- "step": 31900
2243
- },
2244
- {
2245
- "epoch": 0.9343884135836715,
2246
- "grad_norm": 2.1300718784332275,
2247
- "learning_rate": 2.2794233955609425e-06,
2248
- "loss": 0.5529,
2249
- "step": 32000
2250
- },
2251
- {
2252
- "epoch": 0.9373083773761205,
2253
- "grad_norm": 2.274291753768921,
2254
- "learning_rate": 2.0830250117488627e-06,
2255
- "loss": 0.5482,
2256
- "step": 32100
2257
- },
2258
- {
2259
- "epoch": 0.9402283411685695,
2260
- "grad_norm": 8.231354713439941,
2261
- "learning_rate": 1.8953841529673722e-06,
2262
- "loss": 0.5735,
2263
- "step": 32200
2264
- },
2265
- {
2266
- "epoch": 0.9431483049610185,
2267
- "grad_norm": 5.990418910980225,
2268
- "learning_rate": 1.7165176014906415e-06,
2269
- "loss": 0.5678,
2270
- "step": 32300
2271
- },
2272
- {
2273
- "epoch": 0.9460682687534675,
2274
- "grad_norm": 2.1669516563415527,
2275
- "learning_rate": 1.5464413548339162e-06,
2276
- "loss": 0.4608,
2277
- "step": 32400
2278
- },
2279
- {
2280
- "epoch": 0.9489882325459165,
2281
- "grad_norm": 4.036328315734863,
2282
- "learning_rate": 1.3851706243227736e-06,
2283
- "loss": 0.486,
2284
- "step": 32500
2285
- },
2286
- {
2287
- "epoch": 0.9519081963383654,
2288
- "grad_norm": 3.1790454387664795,
2289
- "learning_rate": 1.2327198337326762e-06,
2290
- "loss": 0.5589,
2291
- "step": 32600
2292
- },
2293
- {
2294
- "epoch": 0.9548281601308144,
2295
- "grad_norm": 4.643735408782959,
2296
- "learning_rate": 1.0891026179988718e-06,
2297
- "loss": 0.5459,
2298
- "step": 32700
2299
- },
2300
- {
2301
- "epoch": 0.9577481239232634,
2302
- "grad_norm": 2.757411479949951,
2303
- "learning_rate": 9.543318219969565e-07,
2304
- "loss": 0.5064,
2305
- "step": 32800
2306
- },
2307
- {
2308
- "epoch": 0.9606680877157123,
2309
- "grad_norm": 9.022377014160156,
2310
- "learning_rate": 8.284194993940065e-07,
2311
- "loss": 0.6064,
2312
- "step": 32900
2313
- },
2314
- {
2315
- "epoch": 0.9635880515081613,
2316
- "grad_norm": 4.3898210525512695,
2317
- "learning_rate": 7.113769115705715e-07,
2318
- "loss": 0.5154,
2319
- "step": 33000
2320
- },
2321
- {
2322
- "epoch": 0.9665080153006103,
2323
- "grad_norm": 1.2006958723068237,
2324
- "learning_rate": 6.032145266134493e-07,
2325
- "loss": 0.5003,
2326
- "step": 33100
2327
- },
2328
- {
2329
- "epoch": 0.9694279790930592,
2330
- "grad_norm": 3.930570602416992,
2331
- "learning_rate": 5.03942018379433e-07,
2332
- "loss": 0.5025,
2333
- "step": 33200
2334
- },
2335
- {
2336
- "epoch": 0.9723479428855082,
2337
- "grad_norm": 1.9354479312896729,
2338
- "learning_rate": 4.135682656300821e-07,
2339
- "loss": 0.5043,
2340
- "step": 33300
2341
- },
2342
- {
2343
- "epoch": 0.9752679066779572,
2344
- "grad_norm": 3.8160500526428223,
2345
- "learning_rate": 3.3210135123770137e-07,
2346
- "loss": 0.5009,
2347
- "step": 33400
2348
- },
2349
- {
2350
- "epoch": 0.9781878704704061,
2351
- "grad_norm": 6.7679123878479,
2352
- "learning_rate": 2.5954856146229723e-07,
2353
- "loss": 0.5898,
2354
- "step": 33500
2355
- },
2356
- {
2357
- "epoch": 0.9811078342628552,
2358
- "grad_norm": 2.734801769256592,
2359
- "learning_rate": 1.9591638530003232e-07,
2360
- "loss": 0.54,
2361
- "step": 33600
2362
- },
2363
- {
2364
- "epoch": 0.9840277980553042,
2365
- "grad_norm": 3.867943048477173,
2366
- "learning_rate": 1.4121051390276752e-07,
2367
- "loss": 0.5046,
2368
- "step": 33700
2369
- },
2370
- {
2371
- "epoch": 0.9869477618477531,
2372
- "grad_norm": 2.5671937465667725,
2373
- "learning_rate": 9.543584006909134e-08,
2374
- "loss": 0.609,
2375
- "step": 33800
2376
- },
2377
- {
2378
- "epoch": 0.9898677256402021,
2379
- "grad_norm": 3.0211405754089355,
2380
- "learning_rate": 5.859645780674772e-08,
2381
- "loss": 0.5742,
2382
- "step": 33900
2383
- },
2384
- {
2385
- "epoch": 0.992787689432651,
2386
- "grad_norm": 2.964229106903076,
2387
- "learning_rate": 3.069566196640672e-08,
2388
- "loss": 0.5081,
2389
- "step": 34000
2390
- },
2391
- {
2392
- "epoch": 0.9957076532251,
2393
- "grad_norm": 2.8453125953674316,
2394
- "learning_rate": 1.173594794704469e-08,
2395
- "loss": 0.5484,
2396
- "step": 34100
2397
- },
2398
- {
2399
- "epoch": 0.998627617017549,
2400
- "grad_norm": 2.5372684001922607,
2401
- "learning_rate": 1.7190114727116957e-09,
2402
- "loss": 0.4831,
2403
- "step": 34200
2404
  }
2405
  ],
2406
  "logging_steps": 100,
2407
- "max_steps": 34247,
2408
  "num_input_tokens_seen": 0,
2409
- "num_train_epochs": 1,
2410
  "save_steps": 0,
2411
  "stateful_callbacks": {
2412
  "TrainerControl": {
@@ -2420,7 +1223,7 @@
2420
  "attributes": {}
2421
  }
2422
  },
2423
- "total_flos": 2.5026883378073395e+17,
2424
  "train_batch_size": 8,
2425
  "trial_name": null,
2426
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 17125,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.029197080291970802,
13
+ "grad_norm": 0.8470640182495117,
14
+ "learning_rate": 3.891050583657588e-05,
15
+ "loss": 4.8508,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.058394160583941604,
20
+ "grad_norm": 1.3564280271530151,
21
+ "learning_rate": 7.782101167315176e-05,
22
+ "loss": 4.4557,
23
  "step": 200
24
  },
25
  {
26
+ "epoch": 0.08759124087591241,
27
+ "grad_norm": 1.3530114889144897,
28
+ "learning_rate": 0.00011673151750972763,
29
+ "loss": 4.1905,
30
  "step": 300
31
  },
32
  {
33
+ "epoch": 0.11678832116788321,
34
+ "grad_norm": 1.328213095664978,
35
+ "learning_rate": 0.0001556420233463035,
36
+ "loss": 4.1219,
37
  "step": 400
38
  },
39
  {
40
+ "epoch": 0.145985401459854,
41
+ "grad_norm": 1.3318136930465698,
42
+ "learning_rate": 0.0001945525291828794,
43
+ "loss": 3.9169,
44
  "step": 500
45
  },
46
  {
47
+ "epoch": 0.17518248175182483,
48
+ "grad_norm": 1.7535721063613892,
49
+ "learning_rate": 0.00019998677287306197,
50
+ "loss": 3.8635,
51
  "step": 600
52
  },
53
  {
54
+ "epoch": 0.20437956204379562,
55
+ "grad_norm": 1.6469675302505493,
56
+ "learning_rate": 0.00019993813296600207,
57
+ "loss": 3.775,
58
  "step": 700
59
  },
60
  {
61
+ "epoch": 0.23357664233576642,
62
+ "grad_norm": 1.637732744216919,
63
+ "learning_rate": 0.00019985374713924855,
64
+ "loss": 3.6853,
65
  "step": 800
66
  },
67
  {
68
+ "epoch": 0.26277372262773724,
69
+ "grad_norm": 1.2182079553604126,
70
+ "learning_rate": 0.00019973364557596465,
71
+ "loss": 3.6172,
72
  "step": 900
73
  },
74
  {
75
+ "epoch": 0.291970802919708,
76
+ "grad_norm": 1.8510034084320068,
77
+ "learning_rate": 0.00019957787123413558,
78
+ "loss": 3.5044,
79
  "step": 1000
80
  },
81
  {
82
+ "epoch": 0.32116788321167883,
83
+ "grad_norm": 1.4988285303115845,
84
+ "learning_rate": 0.00019938647983120316,
85
+ "loss": 3.445,
86
  "step": 1100
87
  },
88
  {
89
+ "epoch": 0.35036496350364965,
90
+ "grad_norm": 2.2497406005859375,
91
+ "learning_rate": 0.0001991595398241369,
92
+ "loss": 3.4068,
93
  "step": 1200
94
  },
95
  {
96
+ "epoch": 0.3795620437956204,
97
+ "grad_norm": 2.1076061725616455,
98
+ "learning_rate": 0.00019889713238494824,
99
+ "loss": 3.275,
100
  "step": 1300
101
  },
102
  {
103
+ "epoch": 0.40875912408759124,
104
+ "grad_norm": 2.4178290367126465,
105
+ "learning_rate": 0.0001985993513716568,
106
+ "loss": 3.1974,
107
  "step": 1400
108
  },
109
  {
110
+ "epoch": 0.43795620437956206,
111
+ "grad_norm": 1.8352422714233398,
112
+ "learning_rate": 0.00019826630329471928,
113
+ "loss": 3.1553,
114
  "step": 1500
115
  },
116
  {
117
+ "epoch": 0.46715328467153283,
118
+ "grad_norm": 1.7513710260391235,
119
+ "learning_rate": 0.00019789810727893284,
120
+ "loss": 3.0717,
121
  "step": 1600
122
  },
123
  {
124
+ "epoch": 0.49635036496350365,
125
+ "grad_norm": 1.8882313966751099,
126
+ "learning_rate": 0.00019749489502082632,
127
+ "loss": 3.0037,
128
  "step": 1700
129
  },
130
  {
131
+ "epoch": 0.5255474452554745,
132
+ "grad_norm": 2.062272310256958,
133
+ "learning_rate": 0.000197056810741555,
134
+ "loss": 2.9222,
135
  "step": 1800
136
  },
137
  {
138
+ "epoch": 0.5547445255474452,
139
+ "grad_norm": 1.6502212285995483,
140
+ "learning_rate": 0.00019658401113531565,
141
+ "loss": 2.8871,
142
  "step": 1900
143
  },
144
  {
145
+ "epoch": 0.583941605839416,
146
+ "grad_norm": 1.6748732328414917,
147
+ "learning_rate": 0.0001960766653132999,
148
+ "loss": 2.8554,
149
  "step": 2000
150
  },
151
  {
152
+ "epoch": 0.6131386861313869,
153
+ "grad_norm": 1.9615834951400757,
154
+ "learning_rate": 0.0001955349547432065,
155
+ "loss": 2.7841,
156
  "step": 2100
157
  },
158
  {
159
+ "epoch": 0.6423357664233577,
160
+ "grad_norm": 3.604374885559082,
161
+ "learning_rate": 0.000194959073184334,
162
+ "loss": 2.6938,
163
  "step": 2200
164
  },
165
  {
166
+ "epoch": 0.6715328467153284,
167
+ "grad_norm": 2.6139862537384033,
168
+ "learning_rate": 0.00019434922661827663,
169
+ "loss": 2.634,
170
  "step": 2300
171
  },
172
  {
173
+ "epoch": 0.7007299270072993,
174
+ "grad_norm": 2.5986483097076416,
175
+ "learning_rate": 0.00019370563317524882,
176
+ "loss": 2.5901,
177
  "step": 2400
178
  },
179
  {
180
+ "epoch": 0.7299270072992701,
181
+ "grad_norm": 2.7669074535369873,
182
+ "learning_rate": 0.00019302852305606432,
183
+ "loss": 2.5001,
184
  "step": 2500
185
  },
186
  {
187
+ "epoch": 0.7591240875912408,
188
+ "grad_norm": 2.4265894889831543,
189
+ "learning_rate": 0.00019231813844979777,
190
+ "loss": 2.4595,
191
  "step": 2600
192
  },
193
  {
194
+ "epoch": 0.7883211678832117,
195
+ "grad_norm": 2.589481830596924,
196
+ "learning_rate": 0.0001915747334471584,
197
+ "loss": 2.3496,
198
  "step": 2700
199
  },
200
  {
201
+ "epoch": 0.8175182481751825,
202
+ "grad_norm": 2.969148874282837,
203
+ "learning_rate": 0.0001907985739496068,
204
+ "loss": 2.3099,
205
  "step": 2800
206
  },
207
  {
208
+ "epoch": 0.8467153284671532,
209
+ "grad_norm": 2.7729787826538086,
210
+ "learning_rate": 0.00018998993757424713,
211
+ "loss": 2.2896,
212
  "step": 2900
213
  },
214
  {
215
+ "epoch": 0.8759124087591241,
216
+ "grad_norm": 3.3500335216522217,
217
+ "learning_rate": 0.00018914911355452895,
218
+ "loss": 2.2433,
219
  "step": 3000
220
  },
221
  {
222
+ "epoch": 0.9051094890510949,
223
+ "grad_norm": 2.8922252655029297,
224
+ "learning_rate": 0.00018827640263679394,
225
+ "loss": 2.1564,
226
  "step": 3100
227
  },
228
  {
229
+ "epoch": 0.9343065693430657,
230
+ "grad_norm": 2.678309679031372,
231
+ "learning_rate": 0.0001873721169727048,
232
+ "loss": 2.1654,
233
  "step": 3200
234
  },
235
  {
236
+ "epoch": 0.9635036496350365,
237
+ "grad_norm": 3.6411967277526855,
238
+ "learning_rate": 0.00018643658000759493,
239
+ "loss": 2.1212,
240
  "step": 3300
241
  },
242
  {
243
+ "epoch": 0.9927007299270073,
244
+ "grad_norm": 2.9299466609954834,
245
+ "learning_rate": 0.0001854701263647781,
246
+ "loss": 1.9858,
247
  "step": 3400
248
  },
249
  {
250
+ "epoch": 1.0218978102189782,
251
+ "grad_norm": 1.6852861642837524,
252
+ "learning_rate": 0.0001844731017258603,
253
+ "loss": 1.953,
254
  "step": 3500
255
  },
256
  {
257
+ "epoch": 1.051094890510949,
258
+ "grad_norm": 2.1536929607391357,
259
+ "learning_rate": 0.00018344586270709613,
260
+ "loss": 1.8913,
261
  "step": 3600
262
  },
263
  {
264
+ "epoch": 1.0802919708029197,
265
+ "grad_norm": 1.5574983358383179,
266
+ "learning_rate": 0.00018238877673183428,
267
+ "loss": 1.8142,
268
  "step": 3700
269
  },
270
  {
271
+ "epoch": 1.1094890510948905,
272
+ "grad_norm": 1.319333791732788,
273
+ "learning_rate": 0.0001813022218990972,
274
+ "loss": 1.7464,
275
  "step": 3800
276
  },
277
  {
278
+ "epoch": 1.1386861313868613,
279
+ "grad_norm": 1.4836809635162354,
280
+ "learning_rate": 0.00018018658684834256,
281
+ "loss": 1.7574,
282
  "step": 3900
283
  },
284
  {
285
+ "epoch": 1.167883211678832,
286
+ "grad_norm": 1.7886255979537964,
287
+ "learning_rate": 0.00017904227062045437,
288
+ "loss": 1.7255,
289
  "step": 4000
290
  },
291
  {
292
+ "epoch": 1.197080291970803,
293
+ "grad_norm": 1.7846744060516357,
294
+ "learning_rate": 0.00017786968251501406,
295
+ "loss": 1.7593,
296
  "step": 4100
297
  },
298
  {
299
+ "epoch": 1.2262773722627738,
300
+ "grad_norm": 1.5439305305480957,
301
+ "learning_rate": 0.00017666924194390183,
302
+ "loss": 1.7175,
303
  "step": 4200
304
  },
305
  {
306
+ "epoch": 1.2554744525547445,
307
+ "grad_norm": 1.6620721817016602,
308
+ "learning_rate": 0.0001754413782812812,
309
+ "loss": 1.6345,
310
  "step": 4300
311
  },
312
  {
313
+ "epoch": 1.2846715328467153,
314
+ "grad_norm": 1.7942287921905518,
315
+ "learning_rate": 0.00017418653071002047,
316
+ "loss": 1.6425,
317
  "step": 4400
318
  },
319
  {
320
+ "epoch": 1.313868613138686,
321
+ "grad_norm": 1.7287062406539917,
322
+ "learning_rate": 0.0001729051480646052,
323
+ "loss": 1.5037,
324
  "step": 4500
325
  },
326
  {
327
+ "epoch": 1.343065693430657,
328
+ "grad_norm": 1.340302586555481,
329
+ "learning_rate": 0.00017159768867059936,
330
+ "loss": 1.6512,
331
  "step": 4600
332
  },
333
  {
334
+ "epoch": 1.3722627737226278,
335
+ "grad_norm": 1.3054938316345215,
336
+ "learning_rate": 0.0001702646201807107,
337
+ "loss": 1.5379,
338
  "step": 4700
339
  },
340
  {
341
+ "epoch": 1.4014598540145986,
342
+ "grad_norm": 1.4920105934143066,
343
+ "learning_rate": 0.00016890641940752095,
344
+ "loss": 1.4965,
345
  "step": 4800
346
  },
347
  {
348
+ "epoch": 1.4306569343065694,
349
+ "grad_norm": 1.6876699924468994,
350
+ "learning_rate": 0.00016752357215293897,
351
+ "loss": 1.5866,
352
  "step": 4900
353
  },
354
  {
355
+ "epoch": 1.4598540145985401,
356
+ "grad_norm": 1.3976876735687256,
357
+ "learning_rate": 0.00016611657303443903,
358
+ "loss": 1.4894,
359
  "step": 5000
360
  },
361
  {
362
+ "epoch": 1.489051094890511,
363
+ "grad_norm": 1.5993022918701172,
364
+ "learning_rate": 0.0001646859253081458,
365
+ "loss": 1.5393,
366
  "step": 5100
367
  },
368
  {
369
+ "epoch": 1.5182481751824817,
370
+ "grad_norm": 1.5950144529342651,
371
+ "learning_rate": 0.00016323214068882935,
372
+ "loss": 1.4881,
373
  "step": 5200
374
  },
375
  {
376
+ "epoch": 1.5474452554744524,
377
+ "grad_norm": 1.4699268341064453,
378
+ "learning_rate": 0.00016175573916687484,
379
+ "loss": 1.5266,
380
  "step": 5300
381
  },
382
  {
383
+ "epoch": 1.5766423357664232,
384
+ "grad_norm": 1.494767427444458,
385
+ "learning_rate": 0.00016025724882229208,
386
+ "loss": 1.4496,
387
  "step": 5400
388
  },
389
  {
390
+ "epoch": 1.6058394160583942,
391
+ "grad_norm": 1.589040756225586,
392
+ "learning_rate": 0.00015873720563583165,
393
+ "loss": 1.4994,
394
  "step": 5500
395
  },
396
  {
397
+ "epoch": 1.635036496350365,
398
+ "grad_norm": 1.1471683979034424,
399
+ "learning_rate": 0.00015719615329727512,
400
+ "loss": 1.4464,
401
  "step": 5600
402
  },
403
  {
404
+ "epoch": 1.6642335766423357,
405
+ "grad_norm": 1.5427780151367188,
406
+ "learning_rate": 0.00015563464301096756,
407
+ "loss": 1.3936,
408
  "step": 5700
409
  },
410
  {
411
+ "epoch": 1.6934306569343067,
412
+ "grad_norm": 1.4828369617462158,
413
+ "learning_rate": 0.0001540532332986628,
414
+ "loss": 1.3653,
415
  "step": 5800
416
  },
417
  {
418
+ "epoch": 1.7226277372262775,
419
+ "grad_norm": 1.3401451110839844,
420
+ "learning_rate": 0.0001524524897997509,
421
+ "loss": 1.4141,
422
  "step": 5900
423
  },
424
  {
425
+ "epoch": 1.7518248175182483,
426
+ "grad_norm": 1.289553165435791,
427
+ "learning_rate": 0.00015083298506894015,
428
+ "loss": 1.3119,
429
  "step": 6000
430
  },
431
  {
432
+ "epoch": 1.781021897810219,
433
+ "grad_norm": 1.2549211978912354,
434
+ "learning_rate": 0.00014919529837146528,
435
+ "loss": 1.3384,
436
  "step": 6100
437
  },
438
  {
439
+ "epoch": 1.8102189781021898,
440
+ "grad_norm": 1.71195387840271,
441
+ "learning_rate": 0.00014754001547589564,
442
+ "loss": 1.3231,
443
  "step": 6200
444
  },
445
  {
446
+ "epoch": 1.8394160583941606,
447
+ "grad_norm": 1.4366215467453003,
448
+ "learning_rate": 0.0001458677284446172,
449
+ "loss": 1.3196,
450
  "step": 6300
451
  },
452
  {
453
+ "epoch": 1.8686131386861313,
454
+ "grad_norm": 1.5663456916809082,
455
+ "learning_rate": 0.00014417903542206342,
456
+ "loss": 1.4379,
457
  "step": 6400
458
  },
459
  {
460
+ "epoch": 1.897810218978102,
461
+ "grad_norm": 2.2416155338287354,
462
+ "learning_rate": 0.00014247454042077068,
463
+ "loss": 1.3796,
464
  "step": 6500
465
  },
466
  {
467
+ "epoch": 1.9270072992700729,
468
+ "grad_norm": 1.9517488479614258,
469
+ "learning_rate": 0.00014075485310533473,
470
+ "loss": 1.3097,
471
  "step": 6600
472
  },
473
  {
474
+ "epoch": 1.9562043795620438,
475
+ "grad_norm": 1.7964544296264648,
476
+ "learning_rate": 0.00013902058857434557,
477
+ "loss": 1.3401,
478
  "step": 6700
479
  },
480
  {
481
+ "epoch": 1.9854014598540146,
482
+ "grad_norm": 1.0145196914672852,
483
+ "learning_rate": 0.00013727236714037872,
484
+ "loss": 1.161,
485
  "step": 6800
486
  },
487
  {
488
+ "epoch": 2.0145985401459856,
489
+ "grad_norm": 3.1186952590942383,
490
+ "learning_rate": 0.00013551081410812147,
491
+ "loss": 1.1588,
492
  "step": 6900
493
  },
494
  {
495
+ "epoch": 2.0437956204379564,
496
+ "grad_norm": 2.3698039054870605,
497
+ "learning_rate": 0.0001337365595507137,
498
+ "loss": 1.0564,
499
  "step": 7000
500
  },
501
  {
502
+ "epoch": 2.072992700729927,
503
+ "grad_norm": 3.22995924949646,
504
+ "learning_rate": 0.0001319502380843829,
505
+ "loss": 1.1415,
506
  "step": 7100
507
  },
508
  {
509
+ "epoch": 2.102189781021898,
510
+ "grad_norm": 3.0756795406341553,
511
+ "learning_rate": 0.00013015248864145434,
512
+ "loss": 1.3046,
513
  "step": 7200
514
  },
515
  {
516
+ "epoch": 2.1313868613138687,
517
+ "grad_norm": 3.943307399749756,
518
+ "learning_rate": 0.00012834395424181748,
519
+ "loss": 1.1222,
520
  "step": 7300
521
  },
522
  {
523
+ "epoch": 2.1605839416058394,
524
+ "grad_norm": 2.424041271209717,
525
+ "learning_rate": 0.00012652528176293042,
526
+ "loss": 1.1787,
527
  "step": 7400
528
  },
529
  {
530
+ "epoch": 2.18978102189781,
531
+ "grad_norm": 2.5731918811798096,
532
+ "learning_rate": 0.0001246971217084443,
533
+ "loss": 1.1214,
534
  "step": 7500
535
  },
536
  {
537
+ "epoch": 2.218978102189781,
538
+ "grad_norm": 1.8488351106643677,
539
+ "learning_rate": 0.00012286012797553075,
540
+ "loss": 1.0242,
541
  "step": 7600
542
  },
543
  {
544
+ "epoch": 2.2481751824817517,
545
+ "grad_norm": 3.49588680267334,
546
+ "learning_rate": 0.0001210149576209959,
547
+ "loss": 1.1995,
548
  "step": 7700
549
  },
550
  {
551
+ "epoch": 2.2773722627737225,
552
+ "grad_norm": 1.9914380311965942,
553
+ "learning_rate": 0.00011916227062626388,
554
+ "loss": 1.2218,
555
  "step": 7800
556
  },
557
  {
558
+ "epoch": 2.3065693430656933,
559
+ "grad_norm": 2.816720485687256,
560
+ "learning_rate": 0.00011730272966131422,
561
+ "loss": 1.12,
562
  "step": 7900
563
  },
564
  {
565
+ "epoch": 2.335766423357664,
566
+ "grad_norm": 1.5543715953826904,
567
+ "learning_rate": 0.00011543699984765788,
568
+ "loss": 1.0535,
569
  "step": 8000
570
  },
571
  {
572
+ "epoch": 2.3649635036496353,
573
+ "grad_norm": 2.867494821548462,
574
+ "learning_rate": 0.00011356574852043617,
575
+ "loss": 1.1174,
576
  "step": 8100
577
  },
578
  {
579
+ "epoch": 2.394160583941606,
580
+ "grad_norm": 3.366215944290161,
581
+ "learning_rate": 0.00011168964498972818,
582
+ "loss": 1.3649,
583
  "step": 8200
584
  },
585
  {
586
+ "epoch": 2.423357664233577,
587
+ "grad_norm": 5.301421165466309,
588
+ "learning_rate": 0.00010980936030115132,
589
+ "loss": 1.0221,
590
  "step": 8300
591
  },
592
  {
593
+ "epoch": 2.4525547445255476,
594
+ "grad_norm": 2.44589900970459,
595
+ "learning_rate": 0.0001079255669958416,
596
+ "loss": 1.1887,
597
  "step": 8400
598
  },
599
  {
600
+ "epoch": 2.4817518248175183,
601
+ "grad_norm": 1.8434127569198608,
602
+ "learning_rate": 0.00010603893886989883,
603
+ "loss": 1.0379,
604
  "step": 8500
605
  },
606
  {
607
+ "epoch": 2.510948905109489,
608
+ "grad_norm": 5.0544257164001465,
609
+ "learning_rate": 0.00010415015073338286,
610
+ "loss": 1.0467,
611
  "step": 8600
612
  },
613
  {
614
+ "epoch": 2.54014598540146,
615
+ "grad_norm": 2.266094446182251,
616
+ "learning_rate": 0.00010225987816894698,
617
+ "loss": 1.0284,
618
  "step": 8700
619
  },
620
  {
621
+ "epoch": 2.5693430656934306,
622
+ "grad_norm": 4.524600028991699,
623
+ "learning_rate": 0.00010036879729019559,
624
+ "loss": 1.0921,
625
  "step": 8800
626
  },
627
  {
628
+ "epoch": 2.5985401459854014,
629
+ "grad_norm": 3.9681153297424316,
630
+ "learning_rate": 9.847758449985124e-05,
631
+ "loss": 1.0259,
632
  "step": 8900
633
  },
634
  {
635
+ "epoch": 2.627737226277372,
636
+ "grad_norm": 4.628920078277588,
637
+ "learning_rate": 9.658691624781866e-05,
638
+ "loss": 1.1767,
639
  "step": 9000
640
  },
641
  {
642
+ "epoch": 2.656934306569343,
643
+ "grad_norm": 3.457036256790161,
644
+ "learning_rate": 9.469746878923188e-05,
645
+ "loss": 1.0662,
646
  "step": 9100
647
  },
648
  {
649
+ "epoch": 2.686131386861314,
650
+ "grad_norm": 3.112386703491211,
651
+ "learning_rate": 9.280991794257103e-05,
652
+ "loss": 1.0129,
653
  "step": 9200
654
  },
655
  {
656
+ "epoch": 2.7153284671532845,
657
+ "grad_norm": 2.2668728828430176,
658
+ "learning_rate": 9.092493884793501e-05,
659
+ "loss": 1.0139,
660
  "step": 9300
661
  },
662
  {
663
+ "epoch": 2.7445255474452557,
664
+ "grad_norm": 3.580484390258789,
665
+ "learning_rate": 8.904320572555734e-05,
666
+ "loss": 1.0144,
667
  "step": 9400
668
  },
669
  {
670
+ "epoch": 2.7737226277372264,
671
+ "grad_norm": 1.5489946603775024,
672
+ "learning_rate": 8.71653916346505e-05,
673
+ "loss": 0.9979,
674
  "step": 9500
675
  },
676
  {
677
+ "epoch": 2.802919708029197,
678
+ "grad_norm": 1.7365363836288452,
679
+ "learning_rate": 8.529216823266606e-05,
680
+ "loss": 1.0874,
681
  "step": 9600
682
  },
683
  {
684
+ "epoch": 2.832116788321168,
685
+ "grad_norm": 3.3894410133361816,
686
+ "learning_rate": 8.342420553505559e-05,
687
+ "loss": 1.1251,
688
  "step": 9700
689
  },
690
  {
691
+ "epoch": 2.8613138686131387,
692
+ "grad_norm": 4.5942912101745605,
693
+ "learning_rate": 8.15621716756195e-05,
694
+ "loss": 1.1814,
695
  "step": 9800
696
  },
697
  {
698
+ "epoch": 2.8905109489051095,
699
+ "grad_norm": 1.3230512142181396,
700
+ "learning_rate": 7.970673266752838e-05,
701
+ "loss": 1.1201,
702
  "step": 9900
703
  },
704
  {
705
+ "epoch": 2.9197080291970803,
706
+ "grad_norm": 1.1530183553695679,
707
+ "learning_rate": 7.785855216510337e-05,
708
+ "loss": 1.0758,
709
  "step": 10000
710
  },
711
  {
712
+ "epoch": 2.948905109489051,
713
+ "grad_norm": 2.9893460273742676,
714
+ "learning_rate": 7.601829122643957e-05,
715
+ "loss": 1.02,
716
  "step": 10100
717
  },
718
  {
719
+ "epoch": 2.978102189781022,
720
+ "grad_norm": 2.8600292205810547,
721
+ "learning_rate": 7.418660807695897e-05,
722
+ "loss": 1.1625,
723
  "step": 10200
724
  },
725
  {
726
+ "epoch": 3.0072992700729926,
727
+ "grad_norm": 0.6397629976272583,
728
+ "learning_rate": 7.236415787397548e-05,
729
+ "loss": 1.2091,
730
  "step": 10300
731
  },
732
  {
733
+ "epoch": 3.0364963503649633,
734
+ "grad_norm": 0.5129945278167725,
735
+ "learning_rate": 7.055159247235844e-05,
736
+ "loss": 1.1668,
737
  "step": 10400
738
  },
739
  {
740
+ "epoch": 3.065693430656934,
741
+ "grad_norm": 0.49799928069114685,
742
+ "learning_rate": 6.874956019137669e-05,
743
+ "loss": 0.8265,
744
  "step": 10500
745
  },
746
  {
747
+ "epoch": 3.094890510948905,
748
+ "grad_norm": 0.8108125925064087,
749
+ "learning_rate": 6.695870558280718e-05,
750
+ "loss": 1.0216,
751
  "step": 10600
752
  },
753
  {
754
+ "epoch": 3.124087591240876,
755
+ "grad_norm": 0.45047125220298767,
756
+ "learning_rate": 6.51796692003918e-05,
757
+ "loss": 1.1076,
758
  "step": 10700
759
  },
760
  {
761
+ "epoch": 3.153284671532847,
762
+ "grad_norm": 0.700933039188385,
763
+ "learning_rate": 6.341308737072349e-05,
764
+ "loss": 0.9756,
765
  "step": 10800
766
  },
767
  {
768
+ "epoch": 3.1824817518248176,
769
+ "grad_norm": 0.5597018003463745,
770
+ "learning_rate": 6.165959196564481e-05,
771
+ "loss": 0.854,
772
  "step": 10900
773
  },
774
  {
775
+ "epoch": 3.2116788321167884,
776
+ "grad_norm": 0.5288468599319458,
777
+ "learning_rate": 5.991981017623955e-05,
778
+ "loss": 0.9882,
779
  "step": 11000
780
  },
781
  {
782
+ "epoch": 3.240875912408759,
783
+ "grad_norm": 0.7121827602386475,
784
+ "learning_rate": 5.819436428849896e-05,
785
+ "loss": 1.0701,
786
  "step": 11100
787
  },
788
  {
789
+ "epoch": 3.27007299270073,
790
+ "grad_norm": 0.7537636756896973,
791
+ "learning_rate": 5.648387146074192e-05,
792
+ "loss": 0.9367,
793
  "step": 11200
794
  },
795
  {
796
+ "epoch": 3.2992700729927007,
797
+ "grad_norm": 0.33666905760765076,
798
+ "learning_rate": 5.478894350286965e-05,
799
+ "loss": 1.0724,
800
  "step": 11300
801
  },
802
  {
803
+ "epoch": 3.3284671532846715,
804
+ "grad_norm": 0.4632696509361267,
805
+ "learning_rate": 5.311018665753318e-05,
806
+ "loss": 1.0971,
807
  "step": 11400
808
  },
809
  {
810
+ "epoch": 3.3576642335766422,
811
+ "grad_norm": 0.5160499811172485,
812
+ "learning_rate": 5.144820138329223e-05,
813
+ "loss": 1.0371,
814
  "step": 11500
815
  },
816
  {
817
+ "epoch": 3.386861313868613,
818
+ "grad_norm": 0.5635733008384705,
819
+ "learning_rate": 4.980358213984282e-05,
820
+ "loss": 0.978,
821
  "step": 11600
822
  },
823
  {
824
+ "epoch": 3.4160583941605838,
825
+ "grad_norm": 0.6345902681350708,
826
+ "learning_rate": 4.8176917175390656e-05,
827
+ "loss": 0.9646,
828
  "step": 11700
829
  },
830
  {
831
+ "epoch": 3.445255474452555,
832
+ "grad_norm": 0.6311865448951721,
833
+ "learning_rate": 4.656878831624636e-05,
834
+ "loss": 0.9127,
835
  "step": 11800
836
  },
837
  {
838
+ "epoch": 3.4744525547445253,
839
+ "grad_norm": 0.4736361503601074,
840
+ "learning_rate": 4.497977075871738e-05,
841
+ "loss": 1.2787,
842
  "step": 11900
843
  },
844
  {
845
+ "epoch": 3.5036496350364965,
846
+ "grad_norm": 0.3800007402896881,
847
+ "learning_rate": 4.341043286337153e-05,
848
+ "loss": 0.9448,
849
  "step": 12000
850
  },
851
  {
852
+ "epoch": 3.5328467153284673,
853
+ "grad_norm": 0.4484248161315918,
854
+ "learning_rate": 4.1861335951745594e-05,
855
+ "loss": 0.9813,
856
  "step": 12100
857
  },
858
  {
859
+ "epoch": 3.562043795620438,
860
+ "grad_norm": 0.4944378733634949,
861
+ "learning_rate": 4.0333034105571565e-05,
862
+ "loss": 1.1448,
863
  "step": 12200
864
  },
865
  {
866
+ "epoch": 3.591240875912409,
867
+ "grad_norm": 0.42330047488212585,
868
+ "learning_rate": 3.882607396859229e-05,
869
+ "loss": 1.0001,
870
  "step": 12300
871
  },
872
  {
873
+ "epoch": 3.6204379562043796,
874
+ "grad_norm": 0.492709219455719,
875
+ "learning_rate": 3.734099455103779e-05,
876
+ "loss": 1.1352,
877
  "step": 12400
878
  },
879
  {
880
+ "epoch": 3.6496350364963503,
881
+ "grad_norm": 0.42140600085258484,
882
+ "learning_rate": 3.587832703683175e-05,
883
+ "loss": 1.0103,
884
  "step": 12500
885
  },
886
  {
887
+ "epoch": 3.678832116788321,
888
+ "grad_norm": 0.393767774105072,
889
+ "learning_rate": 3.4438594593597596e-05,
890
+ "loss": 0.9709,
891
  "step": 12600
892
  },
893
  {
894
+ "epoch": 3.708029197080292,
895
+ "grad_norm": 0.5599704384803772,
896
+ "learning_rate": 3.3022312185531214e-05,
897
+ "loss": 0.9962,
898
  "step": 12700
899
  },
900
  {
901
+ "epoch": 3.7372262773722627,
902
+ "grad_norm": 0.3712032735347748,
903
+ "learning_rate": 3.16299863892088e-05,
904
+ "loss": 1.1135,
905
  "step": 12800
906
  },
907
  {
908
+ "epoch": 3.7664233576642334,
909
+ "grad_norm": 0.3675720989704132,
910
+ "learning_rate": 3.026211521239408e-05,
911
+ "loss": 0.9385,
912
  "step": 12900
913
  },
914
  {
915
+ "epoch": 3.795620437956204,
916
+ "grad_norm": 0.3175857961177826,
917
+ "learning_rate": 2.891918791591046e-05,
918
+ "loss": 1.0536,
919
  "step": 13000
920
  },
921
  {
922
+ "epoch": 3.8248175182481754,
923
+ "grad_norm": 0.42325085401535034,
924
+ "learning_rate": 2.7601684838642405e-05,
925
+ "loss": 1.0483,
926
  "step": 13100
927
  },
928
  {
929
+ "epoch": 3.8540145985401457,
930
+ "grad_norm": 0.41798830032348633,
931
+ "learning_rate": 2.6310077225727224e-05,
932
+ "loss": 1.068,
933
  "step": 13200
934
  },
935
  {
936
+ "epoch": 3.883211678832117,
937
+ "grad_norm": 0.4633995294570923,
938
+ "learning_rate": 2.5044827060000085e-05,
939
+ "loss": 1.0305,
940
  "step": 13300
941
  },
942
  {
943
+ "epoch": 3.9124087591240877,
944
+ "grad_norm": 0.38265103101730347,
945
+ "learning_rate": 2.380638689675164e-05,
946
+ "loss": 0.9594,
947
  "step": 13400
948
  },
949
  {
950
+ "epoch": 3.9416058394160585,
951
+ "grad_norm": 0.3224177062511444,
952
+ "learning_rate": 2.2595199701858026e-05,
953
+ "loss": 0.9919,
954
  "step": 13500
955
  },
956
  {
957
+ "epoch": 3.9708029197080292,
958
+ "grad_norm": 0.44377565383911133,
959
+ "learning_rate": 2.1411698693340355e-05,
960
+ "loss": 1.0119,
961
  "step": 13600
962
  },
963
  {
964
+ "epoch": 4.0,
965
+ "grad_norm": 2.024462938308716,
966
+ "learning_rate": 2.0256307186411295e-05,
967
+ "loss": 0.8573,
968
  "step": 13700
969
  },
970
  {
971
+ "epoch": 4.029197080291971,
972
+ "grad_norm": 2.932596206665039,
973
+ "learning_rate": 1.912943844206333e-05,
974
+ "loss": 0.9969,
975
  "step": 13800
976
  },
977
  {
978
+ "epoch": 4.0583941605839415,
979
+ "grad_norm": 2.8395912647247314,
980
+ "learning_rate": 1.803149551925356e-05,
981
+ "loss": 0.8543,
982
  "step": 13900
983
  },
984
  {
985
+ "epoch": 4.087591240875913,
986
+ "grad_norm": 3.7360405921936035,
987
+ "learning_rate": 1.6962871130737168e-05,
988
+ "loss": 0.9961,
989
  "step": 14000
990
  },
991
  {
992
+ "epoch": 4.116788321167883,
993
+ "grad_norm": 1.6590383052825928,
994
+ "learning_rate": 1.59239475026018e-05,
995
+ "loss": 0.839,
996
  "step": 14100
997
  },
998
  {
999
+ "epoch": 4.145985401459854,
1000
+ "grad_norm": 2.4709715843200684,
1001
+ "learning_rate": 1.4915096237552873e-05,
1002
+ "loss": 1.0494,
1003
  "step": 14200
1004
  },
1005
  {
1006
+ "epoch": 4.175182481751825,
1007
+ "grad_norm": 3.2636585235595703,
1008
+ "learning_rate": 1.3936678181998374e-05,
1009
+ "loss": 1.0201,
1010
  "step": 14300
1011
  },
1012
  {
1013
+ "epoch": 4.204379562043796,
1014
+ "grad_norm": 1.440502405166626,
1015
+ "learning_rate": 1.298904329698123e-05,
1016
+ "loss": 1.0966,
1017
  "step": 14400
1018
  },
1019
  {
1020
+ "epoch": 4.233576642335766,
1021
+ "grad_norm": 2.1606736183166504,
1022
+ "learning_rate": 1.2072530533005012e-05,
1023
+ "loss": 1.0813,
1024
  "step": 14500
1025
  },
1026
  {
1027
+ "epoch": 4.262773722627737,
1028
+ "grad_norm": 1.3628579378128052,
1029
+ "learning_rate": 1.1187467708798116e-05,
1030
+ "loss": 1.0362,
1031
  "step": 14600
1032
  },
1033
  {
1034
+ "epoch": 4.291970802919708,
1035
+ "grad_norm": 2.560506820678711,
1036
+ "learning_rate": 1.0334171394059122e-05,
1037
+ "loss": 1.0291,
1038
  "step": 14700
1039
  },
1040
  {
1041
+ "epoch": 4.321167883211679,
1042
+ "grad_norm": 1.9132288694381714,
1043
+ "learning_rate": 9.512946796226296e-06,
1044
+ "loss": 1.0084,
1045
  "step": 14800
1046
  },
1047
  {
1048
+ "epoch": 4.350364963503649,
1049
+ "grad_norm": 2.191732406616211,
1050
+ "learning_rate": 8.724087651310609e-06,
1051
+ "loss": 0.9277,
1052
  "step": 14900
1053
  },
1054
  {
1055
+ "epoch": 4.37956204379562,
1056
+ "grad_norm": 1.6962709426879883,
1057
+ "learning_rate": 7.967876118832229e-06,
1058
+ "loss": 0.9741,
1059
  "step": 15000
1060
  },
1061
  {
1062
+ "epoch": 4.408759124087592,
1063
+ "grad_norm": 1.9875869750976562,
1064
+ "learning_rate": 7.244582680897527e-06,
1065
+ "loss": 0.9878,
1066
  "step": 15100
1067
  },
1068
  {
1069
+ "epoch": 4.437956204379562,
1070
+ "grad_norm": 3.515268087387085,
1071
+ "learning_rate": 6.554466045452923e-06,
1072
+ "loss": 1.0505,
1073
  "step": 15200
1074
  },
1075
  {
1076
+ "epoch": 4.467153284671533,
1077
+ "grad_norm": 3.007796287536621,
1078
+ "learning_rate": 5.897773053750066e-06,
1079
+ "loss": 1.0459,
1080
  "step": 15300
1081
  },
1082
  {
1083
+ "epoch": 4.4963503649635035,
1084
+ "grad_norm": 1.8410799503326416,
1085
+ "learning_rate": 5.274738592055573e-06,
1086
+ "loss": 0.9201,
1087
  "step": 15400
1088
  },
1089
  {
1090
+ "epoch": 4.525547445255475,
1091
+ "grad_norm": 2.346604824066162,
1092
+ "learning_rate": 4.6855855076367804e-06,
1093
+ "loss": 1.0322,
1094
  "step": 15500
1095
  },
1096
  {
1097
+ "epoch": 4.554744525547445,
1098
+ "grad_norm": 2.057304859161377,
1099
+ "learning_rate": 4.130524529053626e-06,
1100
+ "loss": 1.001,
1101
  "step": 15600
1102
  },
1103
  {
1104
+ "epoch": 4.583941605839416,
1105
+ "grad_norm": 4.114916801452637,
1106
+ "learning_rate": 3.609754190785164e-06,
1107
+ "loss": 0.9103,
1108
  "step": 15700
1109
  },
1110
  {
1111
+ "epoch": 4.613138686131387,
1112
+ "grad_norm": 3.7978780269622803,
1113
+ "learning_rate": 3.1234607622176227e-06,
1114
+ "loss": 1.052,
1115
  "step": 15800
1116
  },
1117
  {
1118
+ "epoch": 4.642335766423358,
1119
+ "grad_norm": 1.945447564125061,
1120
+ "learning_rate": 2.6718181810195696e-06,
1121
+ "loss": 0.9763,
1122
  "step": 15900
1123
  },
1124
  {
1125
+ "epoch": 4.671532846715328,
1126
+ "grad_norm": 2.225212812423706,
1127
+ "learning_rate": 2.2549879909276593e-06,
1128
+ "loss": 1.0564,
1129
  "step": 16000
1130
  },
1131
  {
1132
+ "epoch": 4.700729927007299,
1133
+ "grad_norm": 1.6726568937301636,
1134
+ "learning_rate": 1.8731192839657407e-06,
1135
+ "loss": 1.042,
1136
  "step": 16100
1137
  },
1138
  {
1139
+ "epoch": 4.7299270072992705,
1140
+ "grad_norm": 1.6171780824661255,
1141
+ "learning_rate": 1.5263486471174482e-06,
1142
+ "loss": 0.8253,
1143
  "step": 16200
1144
  },
1145
  {
1146
+ "epoch": 4.759124087591241,
1147
+ "grad_norm": 3.288196086883545,
1148
+ "learning_rate": 1.2148001134717369e-06,
1149
+ "loss": 1.0111,
1150
  "step": 16300
1151
  },
1152
  {
1153
+ "epoch": 4.788321167883212,
1154
+ "grad_norm": 3.768450975418091,
1155
+ "learning_rate": 9.385851178586924e-07,
1156
+ "loss": 0.8975,
1157
  "step": 16400
1158
  },
1159
  {
1160
+ "epoch": 4.817518248175182,
1161
+ "grad_norm": 3.030186176300049,
1162
+ "learning_rate": 6.978024569914032e-07,
1163
+ "loss": 1.0036,
1164
  "step": 16500
1165
  },
1166
  {
1167
+ "epoch": 4.846715328467154,
1168
+ "grad_norm": 0.843503475189209,
1169
+ "learning_rate": 4.92538254128383e-07,
1170
+ "loss": 1.011,
1171
  "step": 16600
1172
  },
1173
  {
1174
+ "epoch": 4.875912408759124,
1175
+ "grad_norm": 2.515411376953125,
1176
+ "learning_rate": 3.2286592826888953e-07,
1177
+ "loss": 1.2158,
1178
  "step": 16700
1179
  },
1180
  {
1181
+ "epoch": 4.905109489051095,
1182
+ "grad_norm": 1.0402921438217163,
1183
+ "learning_rate": 1.8884616789244248e-07,
1184
+ "loss": 0.9555,
1185
  "step": 16800
1186
  },
1187
  {
1188
+ "epoch": 4.934306569343065,
1189
+ "grad_norm": 0.927361249923706,
1190
+ "learning_rate": 9.052690925168695e-08,
1191
+ "loss": 1.0264,
1192
  "step": 16900
1193
  },
1194
  {
1195
+ "epoch": 4.963503649635037,
1196
+ "grad_norm": 1.8087859153747559,
1197
+ "learning_rate": 2.7943319226564346e-08,
1198
+ "loss": 1.0197,
1199
  "step": 17000
1200
  },
1201
  {
1202
+ "epoch": 4.992700729927007,
1203
+ "grad_norm": 2.9732730388641357,
1204
+ "learning_rate": 1.1177827458075386e-09,
1205
+ "loss": 1.0122,
1206
  "step": 17100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1207
  }
1208
  ],
1209
  "logging_steps": 100,
1210
+ "max_steps": 17125,
1211
  "num_input_tokens_seen": 0,
1212
+ "num_train_epochs": 5,
1213
  "save_steps": 0,
1214
  "stateful_callbacks": {
1215
  "TrainerControl": {
 
1223
  "attributes": {}
1224
  }
1225
  },
1226
+ "total_flos": 1.2724762270100429e+17,
1227
  "train_batch_size": 8,
1228
  "trial_name": null,
1229
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0251723f961cec3d43bfad005955168aa7a10cb8822875228569bb1d18b0ace
3
- size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5abbcc02e7630f5ef822b6e6a8955573defc6c815565874ce5885a2c0ef54915
3
+ size 5560