sft_medico-mistral / trainer_state.json
Boyue27's picture
Upload trainer_state.json with huggingface_hub
95d111f verified
raw
history blame
73.8 kB
{
"best_metric": 0.9312900900840759,
"best_model_checkpoint": "/proj/berzelius-2023-338/users/x_boyji/model_hf/sft_medico-mixtral/checkpoint-1000",
"epoch": 2.9991683991683993,
"eval_steps": 500,
"global_step": 1803,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 6.000000000000001e-07,
"loss": 2.0272,
"step": 3
},
{
"epoch": 0.01,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.921,
"step": 6
},
{
"epoch": 0.01,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.6681,
"step": 9
},
{
"epoch": 0.02,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.5087,
"step": 12
},
{
"epoch": 0.02,
"learning_rate": 3e-06,
"loss": 1.4299,
"step": 15
},
{
"epoch": 0.03,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.3476,
"step": 18
},
{
"epoch": 0.03,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.3558,
"step": 21
},
{
"epoch": 0.04,
"learning_rate": 4.800000000000001e-06,
"loss": 1.3113,
"step": 24
},
{
"epoch": 0.04,
"learning_rate": 5.400000000000001e-06,
"loss": 1.3135,
"step": 27
},
{
"epoch": 0.05,
"learning_rate": 6e-06,
"loss": 1.3068,
"step": 30
},
{
"epoch": 0.05,
"learning_rate": 6.600000000000001e-06,
"loss": 1.2869,
"step": 33
},
{
"epoch": 0.06,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.2698,
"step": 36
},
{
"epoch": 0.06,
"learning_rate": 7.800000000000002e-06,
"loss": 1.2347,
"step": 39
},
{
"epoch": 0.07,
"learning_rate": 8.400000000000001e-06,
"loss": 1.2687,
"step": 42
},
{
"epoch": 0.07,
"learning_rate": 9e-06,
"loss": 1.2221,
"step": 45
},
{
"epoch": 0.08,
"learning_rate": 9.600000000000001e-06,
"loss": 1.2231,
"step": 48
},
{
"epoch": 0.08,
"learning_rate": 1.02e-05,
"loss": 1.1976,
"step": 51
},
{
"epoch": 0.09,
"learning_rate": 1.0800000000000002e-05,
"loss": 1.1833,
"step": 54
},
{
"epoch": 0.09,
"learning_rate": 1.14e-05,
"loss": 1.1798,
"step": 57
},
{
"epoch": 0.1,
"learning_rate": 1.2e-05,
"loss": 1.1878,
"step": 60
},
{
"epoch": 0.1,
"learning_rate": 1.2600000000000001e-05,
"loss": 1.1788,
"step": 63
},
{
"epoch": 0.11,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.1948,
"step": 66
},
{
"epoch": 0.11,
"learning_rate": 1.38e-05,
"loss": 1.1836,
"step": 69
},
{
"epoch": 0.12,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.1756,
"step": 72
},
{
"epoch": 0.12,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.1868,
"step": 75
},
{
"epoch": 0.13,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.1652,
"step": 78
},
{
"epoch": 0.13,
"learning_rate": 1.62e-05,
"loss": 1.1688,
"step": 81
},
{
"epoch": 0.14,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.1783,
"step": 84
},
{
"epoch": 0.14,
"learning_rate": 1.7400000000000003e-05,
"loss": 1.1799,
"step": 87
},
{
"epoch": 0.15,
"learning_rate": 1.8e-05,
"loss": 1.3097,
"step": 90
},
{
"epoch": 0.15,
"learning_rate": 1.86e-05,
"loss": 1.2879,
"step": 93
},
{
"epoch": 0.16,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.35,
"step": 96
},
{
"epoch": 0.16,
"learning_rate": 1.98e-05,
"loss": 1.2501,
"step": 99
},
{
"epoch": 0.17,
"learning_rate": 1.9999931938745066e-05,
"loss": 1.1856,
"step": 102
},
{
"epoch": 0.17,
"learning_rate": 1.999957461968997e-05,
"loss": 1.1927,
"step": 105
},
{
"epoch": 0.18,
"learning_rate": 1.9998911038450304e-05,
"loss": 1.2005,
"step": 108
},
{
"epoch": 0.18,
"learning_rate": 1.999794121534991e-05,
"loss": 1.1375,
"step": 111
},
{
"epoch": 0.19,
"learning_rate": 1.9996665180092068e-05,
"loss": 1.1645,
"step": 114
},
{
"epoch": 0.19,
"learning_rate": 1.9995082971758574e-05,
"loss": 1.1381,
"step": 117
},
{
"epoch": 0.2,
"learning_rate": 1.999319463880855e-05,
"loss": 1.189,
"step": 120
},
{
"epoch": 0.2,
"learning_rate": 1.9991000239076952e-05,
"loss": 1.1539,
"step": 123
},
{
"epoch": 0.21,
"learning_rate": 1.9988499839772805e-05,
"loss": 1.1454,
"step": 126
},
{
"epoch": 0.21,
"learning_rate": 1.9985693517477142e-05,
"loss": 1.1611,
"step": 129
},
{
"epoch": 0.22,
"learning_rate": 1.9982581358140657e-05,
"loss": 1.162,
"step": 132
},
{
"epoch": 0.22,
"learning_rate": 1.997916345708108e-05,
"loss": 1.1244,
"step": 135
},
{
"epoch": 0.23,
"learning_rate": 1.9975439918980246e-05,
"loss": 1.1738,
"step": 138
},
{
"epoch": 0.23,
"learning_rate": 1.99714108578809e-05,
"loss": 1.1746,
"step": 141
},
{
"epoch": 0.24,
"learning_rate": 1.9967076397183187e-05,
"loss": 1.1332,
"step": 144
},
{
"epoch": 0.24,
"learning_rate": 1.9962436669640906e-05,
"loss": 1.145,
"step": 147
},
{
"epoch": 0.25,
"learning_rate": 1.9957491817357407e-05,
"loss": 1.1246,
"step": 150
},
{
"epoch": 0.25,
"learning_rate": 1.995224199178126e-05,
"loss": 1.1561,
"step": 153
},
{
"epoch": 0.26,
"learning_rate": 1.9946687353701607e-05,
"loss": 1.1436,
"step": 156
},
{
"epoch": 0.26,
"learning_rate": 1.9940828073243244e-05,
"loss": 1.1285,
"step": 159
},
{
"epoch": 0.27,
"learning_rate": 1.9934664329861412e-05,
"loss": 1.1578,
"step": 162
},
{
"epoch": 0.27,
"learning_rate": 1.9928196312336287e-05,
"loss": 1.141,
"step": 165
},
{
"epoch": 0.28,
"learning_rate": 1.9921424218767215e-05,
"loss": 1.1133,
"step": 168
},
{
"epoch": 0.28,
"learning_rate": 1.991434825656664e-05,
"loss": 1.114,
"step": 171
},
{
"epoch": 0.29,
"learning_rate": 1.9906968642453745e-05,
"loss": 1.1049,
"step": 174
},
{
"epoch": 0.29,
"learning_rate": 1.989928560244781e-05,
"loss": 1.1417,
"step": 177
},
{
"epoch": 0.3,
"learning_rate": 1.9891299371861312e-05,
"loss": 1.1442,
"step": 180
},
{
"epoch": 0.3,
"learning_rate": 1.9883010195292702e-05,
"loss": 1.1164,
"step": 183
},
{
"epoch": 0.31,
"learning_rate": 1.9874418326618906e-05,
"loss": 1.1319,
"step": 186
},
{
"epoch": 0.31,
"learning_rate": 1.9865524028987565e-05,
"loss": 1.1265,
"step": 189
},
{
"epoch": 0.32,
"learning_rate": 1.9856327574808974e-05,
"loss": 1.1289,
"step": 192
},
{
"epoch": 0.32,
"learning_rate": 1.9846829245747725e-05,
"loss": 1.1381,
"step": 195
},
{
"epoch": 0.33,
"learning_rate": 1.9837029332714094e-05,
"loss": 1.13,
"step": 198
},
{
"epoch": 0.33,
"learning_rate": 1.9826928135855128e-05,
"loss": 1.134,
"step": 201
},
{
"epoch": 0.34,
"learning_rate": 1.9816525964545447e-05,
"loss": 1.1173,
"step": 204
},
{
"epoch": 0.34,
"learning_rate": 1.9805823137377776e-05,
"loss": 1.1364,
"step": 207
},
{
"epoch": 0.35,
"learning_rate": 1.979481998215318e-05,
"loss": 1.1241,
"step": 210
},
{
"epoch": 0.35,
"learning_rate": 1.978351683587103e-05,
"loss": 1.134,
"step": 213
},
{
"epoch": 0.36,
"learning_rate": 1.9771914044718682e-05,
"loss": 1.1064,
"step": 216
},
{
"epoch": 0.36,
"learning_rate": 1.9760011964060863e-05,
"loss": 1.1324,
"step": 219
},
{
"epoch": 0.37,
"learning_rate": 1.974781095842881e-05,
"loss": 1.1239,
"step": 222
},
{
"epoch": 0.37,
"learning_rate": 1.9735311401509078e-05,
"loss": 1.1348,
"step": 225
},
{
"epoch": 0.38,
"learning_rate": 1.9722513676132112e-05,
"loss": 1.1425,
"step": 228
},
{
"epoch": 0.38,
"learning_rate": 1.9709418174260523e-05,
"loss": 1.1224,
"step": 231
},
{
"epoch": 0.39,
"learning_rate": 1.9696025296977067e-05,
"loss": 1.0922,
"step": 234
},
{
"epoch": 0.39,
"learning_rate": 1.9682335454472382e-05,
"loss": 1.1123,
"step": 237
},
{
"epoch": 0.4,
"learning_rate": 1.9668349066032412e-05,
"loss": 1.0966,
"step": 240
},
{
"epoch": 0.4,
"learning_rate": 1.9654066560025566e-05,
"loss": 1.119,
"step": 243
},
{
"epoch": 0.41,
"learning_rate": 1.9639488373889597e-05,
"loss": 1.0947,
"step": 246
},
{
"epoch": 0.41,
"learning_rate": 1.9624614954118214e-05,
"loss": 1.0871,
"step": 249
},
{
"epoch": 0.42,
"learning_rate": 1.9609446756247402e-05,
"loss": 1.1088,
"step": 252
},
{
"epoch": 0.42,
"learning_rate": 1.9593984244841455e-05,
"loss": 1.0891,
"step": 255
},
{
"epoch": 0.43,
"learning_rate": 1.957822789347878e-05,
"loss": 1.1095,
"step": 258
},
{
"epoch": 0.43,
"learning_rate": 1.9562178184737358e-05,
"loss": 1.1145,
"step": 261
},
{
"epoch": 0.44,
"learning_rate": 1.9545835610179985e-05,
"loss": 1.1324,
"step": 264
},
{
"epoch": 0.44,
"learning_rate": 1.952920067033921e-05,
"loss": 1.1047,
"step": 267
},
{
"epoch": 0.45,
"learning_rate": 1.9512273874702016e-05,
"loss": 1.1257,
"step": 270
},
{
"epoch": 0.45,
"learning_rate": 1.9495055741694188e-05,
"loss": 1.1089,
"step": 273
},
{
"epoch": 0.46,
"learning_rate": 1.9477546798664463e-05,
"loss": 1.0767,
"step": 276
},
{
"epoch": 0.46,
"learning_rate": 1.9459747581868366e-05,
"loss": 1.0819,
"step": 279
},
{
"epoch": 0.47,
"learning_rate": 1.9441658636451794e-05,
"loss": 1.0888,
"step": 282
},
{
"epoch": 0.47,
"learning_rate": 1.94232805164343e-05,
"loss": 1.1043,
"step": 285
},
{
"epoch": 0.48,
"learning_rate": 1.9404613784692152e-05,
"loss": 1.0753,
"step": 288
},
{
"epoch": 0.48,
"learning_rate": 1.9385659012941072e-05,
"loss": 1.0885,
"step": 291
},
{
"epoch": 0.49,
"learning_rate": 1.9366416781718733e-05,
"loss": 1.08,
"step": 294
},
{
"epoch": 0.49,
"learning_rate": 1.9346887680366984e-05,
"loss": 1.0894,
"step": 297
},
{
"epoch": 0.5,
"learning_rate": 1.9327072307013794e-05,
"loss": 1.0798,
"step": 300
},
{
"epoch": 0.5,
"learning_rate": 1.9306971268554927e-05,
"loss": 1.0543,
"step": 303
},
{
"epoch": 0.51,
"learning_rate": 1.9286585180635375e-05,
"loss": 1.0734,
"step": 306
},
{
"epoch": 0.51,
"learning_rate": 1.926591466763047e-05,
"loss": 1.0822,
"step": 309
},
{
"epoch": 0.52,
"learning_rate": 1.9244960362626795e-05,
"loss": 1.0944,
"step": 312
},
{
"epoch": 0.52,
"learning_rate": 1.922372290740277e-05,
"loss": 1.0789,
"step": 315
},
{
"epoch": 0.53,
"learning_rate": 1.9202202952409012e-05,
"loss": 1.0973,
"step": 318
},
{
"epoch": 0.53,
"learning_rate": 1.9180401156748397e-05,
"loss": 1.0952,
"step": 321
},
{
"epoch": 0.54,
"learning_rate": 1.9158318188155882e-05,
"loss": 1.0775,
"step": 324
},
{
"epoch": 0.54,
"learning_rate": 1.9135954722978062e-05,
"loss": 1.0793,
"step": 327
},
{
"epoch": 0.55,
"learning_rate": 1.9113311446152445e-05,
"loss": 1.0439,
"step": 330
},
{
"epoch": 0.55,
"learning_rate": 1.909038905118647e-05,
"loss": 1.0889,
"step": 333
},
{
"epoch": 0.56,
"learning_rate": 1.906718824013628e-05,
"loss": 1.0371,
"step": 336
},
{
"epoch": 0.56,
"learning_rate": 1.9043709723585207e-05,
"loss": 1.093,
"step": 339
},
{
"epoch": 0.57,
"learning_rate": 1.9019954220622018e-05,
"loss": 1.0692,
"step": 342
},
{
"epoch": 0.57,
"learning_rate": 1.8995922458818885e-05,
"loss": 1.0658,
"step": 345
},
{
"epoch": 0.58,
"learning_rate": 1.8971615174209104e-05,
"loss": 1.0633,
"step": 348
},
{
"epoch": 0.58,
"learning_rate": 1.8947033111264558e-05,
"loss": 1.0732,
"step": 351
},
{
"epoch": 0.59,
"learning_rate": 1.892217702287289e-05,
"loss": 1.0536,
"step": 354
},
{
"epoch": 0.59,
"learning_rate": 1.889704767031449e-05,
"loss": 1.0752,
"step": 357
},
{
"epoch": 0.6,
"learning_rate": 1.8871645823239128e-05,
"loss": 1.0933,
"step": 360
},
{
"epoch": 0.6,
"learning_rate": 1.8845972259642424e-05,
"loss": 1.0703,
"step": 363
},
{
"epoch": 0.61,
"learning_rate": 1.8820027765841993e-05,
"loss": 1.0722,
"step": 366
},
{
"epoch": 0.61,
"learning_rate": 1.8793813136453364e-05,
"loss": 1.0367,
"step": 369
},
{
"epoch": 0.62,
"learning_rate": 1.8767329174365664e-05,
"loss": 1.0361,
"step": 372
},
{
"epoch": 0.62,
"learning_rate": 1.8740576690717004e-05,
"loss": 1.0454,
"step": 375
},
{
"epoch": 0.63,
"learning_rate": 1.8713556504869644e-05,
"loss": 1.0725,
"step": 378
},
{
"epoch": 0.63,
"learning_rate": 1.8686269444384905e-05,
"loss": 1.1055,
"step": 381
},
{
"epoch": 0.64,
"learning_rate": 1.8658716344997812e-05,
"loss": 1.1818,
"step": 384
},
{
"epoch": 0.64,
"learning_rate": 1.8630898050591502e-05,
"loss": 1.101,
"step": 387
},
{
"epoch": 0.65,
"learning_rate": 1.860281541317138e-05,
"loss": 1.0709,
"step": 390
},
{
"epoch": 0.65,
"learning_rate": 1.857446929283902e-05,
"loss": 1.0668,
"step": 393
},
{
"epoch": 0.66,
"learning_rate": 1.8545860557765828e-05,
"loss": 1.0396,
"step": 396
},
{
"epoch": 0.66,
"learning_rate": 1.8516990084166444e-05,
"loss": 1.0795,
"step": 399
},
{
"epoch": 0.67,
"learning_rate": 1.8487858756271918e-05,
"loss": 1.0401,
"step": 402
},
{
"epoch": 0.67,
"learning_rate": 1.845846746630261e-05,
"loss": 1.0679,
"step": 405
},
{
"epoch": 0.68,
"learning_rate": 1.842881711444088e-05,
"loss": 1.0702,
"step": 408
},
{
"epoch": 0.68,
"learning_rate": 1.8398908608803518e-05,
"loss": 1.0489,
"step": 411
},
{
"epoch": 0.69,
"learning_rate": 1.8368742865413905e-05,
"loss": 1.0245,
"step": 414
},
{
"epoch": 0.69,
"learning_rate": 1.8338320808173998e-05,
"loss": 1.0603,
"step": 417
},
{
"epoch": 0.7,
"learning_rate": 1.8307643368836003e-05,
"loss": 1.0493,
"step": 420
},
{
"epoch": 0.7,
"learning_rate": 1.8276711486973838e-05,
"loss": 1.048,
"step": 423
},
{
"epoch": 0.71,
"learning_rate": 1.8245526109954385e-05,
"loss": 1.078,
"step": 426
},
{
"epoch": 0.71,
"learning_rate": 1.8214088192908433e-05,
"loss": 1.0251,
"step": 429
},
{
"epoch": 0.72,
"learning_rate": 1.818239869870146e-05,
"loss": 1.0558,
"step": 432
},
{
"epoch": 0.72,
"learning_rate": 1.815045859790413e-05,
"loss": 1.0433,
"step": 435
},
{
"epoch": 0.73,
"learning_rate": 1.811826886876255e-05,
"loss": 1.0484,
"step": 438
},
{
"epoch": 0.73,
"learning_rate": 1.8085830497168334e-05,
"loss": 1.0559,
"step": 441
},
{
"epoch": 0.74,
"learning_rate": 1.805314447662841e-05,
"loss": 1.0715,
"step": 444
},
{
"epoch": 0.74,
"learning_rate": 1.8020211808234556e-05,
"loss": 1.0418,
"step": 447
},
{
"epoch": 0.75,
"learning_rate": 1.7987033500632785e-05,
"loss": 1.0166,
"step": 450
},
{
"epoch": 0.75,
"learning_rate": 1.7953610569992415e-05,
"loss": 1.0427,
"step": 453
},
{
"epoch": 0.76,
"learning_rate": 1.7919944039974962e-05,
"loss": 1.0463,
"step": 456
},
{
"epoch": 0.76,
"learning_rate": 1.78860349417028e-05,
"loss": 1.0035,
"step": 459
},
{
"epoch": 0.77,
"learning_rate": 1.7851884313727547e-05,
"loss": 1.047,
"step": 462
},
{
"epoch": 0.77,
"learning_rate": 1.78174932019983e-05,
"loss": 1.0241,
"step": 465
},
{
"epoch": 0.78,
"learning_rate": 1.7782862659829553e-05,
"loss": 1.0308,
"step": 468
},
{
"epoch": 0.78,
"learning_rate": 1.7747993747868985e-05,
"loss": 1.021,
"step": 471
},
{
"epoch": 0.79,
"learning_rate": 1.7712887534064935e-05,
"loss": 1.0216,
"step": 474
},
{
"epoch": 0.79,
"learning_rate": 1.7677545093633713e-05,
"loss": 1.0228,
"step": 477
},
{
"epoch": 0.8,
"learning_rate": 1.7641967509026667e-05,
"loss": 1.02,
"step": 480
},
{
"epoch": 0.8,
"learning_rate": 1.7606155869897024e-05,
"loss": 1.0065,
"step": 483
},
{
"epoch": 0.81,
"learning_rate": 1.757011127306653e-05,
"loss": 1.0353,
"step": 486
},
{
"epoch": 0.81,
"learning_rate": 1.7533834822491832e-05,
"loss": 1.0346,
"step": 489
},
{
"epoch": 0.82,
"learning_rate": 1.7497327629230707e-05,
"loss": 1.0241,
"step": 492
},
{
"epoch": 0.82,
"learning_rate": 1.746059081140798e-05,
"loss": 1.0036,
"step": 495
},
{
"epoch": 0.83,
"learning_rate": 1.7423625494181334e-05,
"loss": 1.0195,
"step": 498
},
{
"epoch": 0.83,
"eval_loss": 0.9461708068847656,
"eval_runtime": 61.2738,
"eval_samples_per_second": 7.556,
"eval_steps_per_second": 0.947,
"step": 500
},
{
"epoch": 0.83,
"learning_rate": 1.7386432809706795e-05,
"loss": 1.0261,
"step": 501
},
{
"epoch": 0.84,
"learning_rate": 1.734901389710411e-05,
"loss": 1.0162,
"step": 504
},
{
"epoch": 0.84,
"learning_rate": 1.7311369902421814e-05,
"loss": 1.0056,
"step": 507
},
{
"epoch": 0.85,
"learning_rate": 1.727350197860216e-05,
"loss": 1.0025,
"step": 510
},
{
"epoch": 0.85,
"learning_rate": 1.723541128544579e-05,
"loss": 1.0088,
"step": 513
},
{
"epoch": 0.86,
"learning_rate": 1.7197098989576222e-05,
"loss": 1.0285,
"step": 516
},
{
"epoch": 0.86,
"learning_rate": 1.7158566264404112e-05,
"loss": 1.0105,
"step": 519
},
{
"epoch": 0.87,
"learning_rate": 1.7119814290091315e-05,
"loss": 1.0384,
"step": 522
},
{
"epoch": 0.87,
"learning_rate": 1.708084425351476e-05,
"loss": 1.0274,
"step": 525
},
{
"epoch": 0.88,
"learning_rate": 1.704165734823006e-05,
"loss": 1.0307,
"step": 528
},
{
"epoch": 0.88,
"learning_rate": 1.7002254774435e-05,
"loss": 1.0112,
"step": 531
},
{
"epoch": 0.89,
"learning_rate": 1.6962637738932746e-05,
"loss": 1.007,
"step": 534
},
{
"epoch": 0.89,
"learning_rate": 1.6922807455094898e-05,
"loss": 0.9999,
"step": 537
},
{
"epoch": 0.9,
"learning_rate": 1.688276514282431e-05,
"loss": 1.015,
"step": 540
},
{
"epoch": 0.9,
"learning_rate": 1.6842512028517773e-05,
"loss": 0.9912,
"step": 543
},
{
"epoch": 0.91,
"learning_rate": 1.6802049345028388e-05,
"loss": 1.0109,
"step": 546
},
{
"epoch": 0.91,
"learning_rate": 1.676137833162786e-05,
"loss": 1.0079,
"step": 549
},
{
"epoch": 0.92,
"learning_rate": 1.6720500233968522e-05,
"loss": 1.0352,
"step": 552
},
{
"epoch": 0.92,
"learning_rate": 1.6679416304045172e-05,
"loss": 1.0021,
"step": 555
},
{
"epoch": 0.93,
"learning_rate": 1.6638127800156754e-05,
"loss": 0.9867,
"step": 558
},
{
"epoch": 0.93,
"learning_rate": 1.65966359868678e-05,
"loss": 0.9974,
"step": 561
},
{
"epoch": 0.94,
"learning_rate": 1.65549421349697e-05,
"loss": 1.0049,
"step": 564
},
{
"epoch": 0.94,
"learning_rate": 1.6513047521441795e-05,
"loss": 1.0141,
"step": 567
},
{
"epoch": 0.95,
"learning_rate": 1.6470953429412257e-05,
"loss": 0.9925,
"step": 570
},
{
"epoch": 0.95,
"learning_rate": 1.6428661148118775e-05,
"loss": 1.0206,
"step": 573
},
{
"epoch": 0.96,
"learning_rate": 1.638617197286911e-05,
"loss": 1.0067,
"step": 576
},
{
"epoch": 0.96,
"learning_rate": 1.634348720500136e-05,
"loss": 0.9687,
"step": 579
},
{
"epoch": 0.97,
"learning_rate": 1.6300608151844184e-05,
"loss": 1.0217,
"step": 582
},
{
"epoch": 0.97,
"learning_rate": 1.6257536126676684e-05,
"loss": 0.9977,
"step": 585
},
{
"epoch": 0.98,
"learning_rate": 1.6214272448688228e-05,
"loss": 0.9951,
"step": 588
},
{
"epoch": 0.98,
"learning_rate": 1.6170818442938035e-05,
"loss": 0.994,
"step": 591
},
{
"epoch": 0.99,
"learning_rate": 1.6127175440314596e-05,
"loss": 0.9697,
"step": 594
},
{
"epoch": 0.99,
"learning_rate": 1.6083344777494896e-05,
"loss": 1.0084,
"step": 597
},
{
"epoch": 1.0,
"learning_rate": 1.6039327796903502e-05,
"loss": 0.9991,
"step": 600
},
{
"epoch": 1.0,
"learning_rate": 1.5995125846671417e-05,
"loss": 0.7961,
"step": 603
},
{
"epoch": 1.01,
"learning_rate": 1.5950740280594815e-05,
"loss": 0.6247,
"step": 606
},
{
"epoch": 1.01,
"learning_rate": 1.590617245809357e-05,
"loss": 0.6435,
"step": 609
},
{
"epoch": 1.02,
"learning_rate": 1.5861423744169608e-05,
"loss": 0.6282,
"step": 612
},
{
"epoch": 1.02,
"learning_rate": 1.5816495509365123e-05,
"loss": 0.6123,
"step": 615
},
{
"epoch": 1.03,
"learning_rate": 1.577138912972058e-05,
"loss": 0.6011,
"step": 618
},
{
"epoch": 1.03,
"learning_rate": 1.5726105986732582e-05,
"loss": 0.6344,
"step": 621
},
{
"epoch": 1.04,
"learning_rate": 1.568064746731156e-05,
"loss": 0.6108,
"step": 624
},
{
"epoch": 1.04,
"learning_rate": 1.5635014963739278e-05,
"loss": 0.6274,
"step": 627
},
{
"epoch": 1.05,
"learning_rate": 1.5589209873626215e-05,
"loss": 0.5958,
"step": 630
},
{
"epoch": 1.05,
"learning_rate": 1.5543233599868744e-05,
"loss": 0.6143,
"step": 633
},
{
"epoch": 1.06,
"learning_rate": 1.5497087550606167e-05,
"loss": 0.6181,
"step": 636
},
{
"epoch": 1.06,
"learning_rate": 1.5450773139177587e-05,
"loss": 0.6139,
"step": 639
},
{
"epoch": 1.07,
"learning_rate": 1.5404291784078633e-05,
"loss": 0.6214,
"step": 642
},
{
"epoch": 1.07,
"learning_rate": 1.5357644908917995e-05,
"loss": 0.5895,
"step": 645
},
{
"epoch": 1.08,
"learning_rate": 1.5310833942373833e-05,
"loss": 0.5976,
"step": 648
},
{
"epoch": 1.08,
"learning_rate": 1.526386031815003e-05,
"loss": 0.6034,
"step": 651
},
{
"epoch": 1.09,
"learning_rate": 1.5216725474932246e-05,
"loss": 0.6124,
"step": 654
},
{
"epoch": 1.09,
"learning_rate": 1.5169430856343909e-05,
"loss": 0.6143,
"step": 657
},
{
"epoch": 1.1,
"learning_rate": 1.5121977910901953e-05,
"loss": 0.6156,
"step": 660
},
{
"epoch": 1.1,
"learning_rate": 1.5074368091972476e-05,
"loss": 0.622,
"step": 663
},
{
"epoch": 1.11,
"learning_rate": 1.5026602857726216e-05,
"loss": 0.5983,
"step": 666
},
{
"epoch": 1.11,
"learning_rate": 1.4978683671093905e-05,
"loss": 0.601,
"step": 669
},
{
"epoch": 1.12,
"learning_rate": 1.4930611999721457e-05,
"loss": 0.5974,
"step": 672
},
{
"epoch": 1.12,
"learning_rate": 1.4882389315925002e-05,
"loss": 0.5977,
"step": 675
},
{
"epoch": 1.13,
"learning_rate": 1.483401709664582e-05,
"loss": 0.6019,
"step": 678
},
{
"epoch": 1.13,
"learning_rate": 1.4785496823405084e-05,
"loss": 0.608,
"step": 681
},
{
"epoch": 1.14,
"learning_rate": 1.4736829982258498e-05,
"loss": 0.6013,
"step": 684
},
{
"epoch": 1.14,
"learning_rate": 1.468801806375077e-05,
"loss": 0.5834,
"step": 687
},
{
"epoch": 1.15,
"learning_rate": 1.4639062562869986e-05,
"loss": 0.604,
"step": 690
},
{
"epoch": 1.15,
"learning_rate": 1.4589964979001779e-05,
"loss": 0.6028,
"step": 693
},
{
"epoch": 1.16,
"learning_rate": 1.4540726815883445e-05,
"loss": 0.5999,
"step": 696
},
{
"epoch": 1.16,
"learning_rate": 1.4491349581557884e-05,
"loss": 0.6015,
"step": 699
},
{
"epoch": 1.17,
"learning_rate": 1.4441834788327382e-05,
"loss": 0.6103,
"step": 702
},
{
"epoch": 1.17,
"learning_rate": 1.4392183952707324e-05,
"loss": 0.5958,
"step": 705
},
{
"epoch": 1.18,
"learning_rate": 1.4342398595379738e-05,
"loss": 0.6088,
"step": 708
},
{
"epoch": 1.18,
"learning_rate": 1.4292480241146715e-05,
"loss": 0.5966,
"step": 711
},
{
"epoch": 1.19,
"learning_rate": 1.4242430418883713e-05,
"loss": 0.5949,
"step": 714
},
{
"epoch": 1.19,
"learning_rate": 1.4192250661492724e-05,
"loss": 0.5962,
"step": 717
},
{
"epoch": 1.2,
"learning_rate": 1.414194250585534e-05,
"loss": 0.6081,
"step": 720
},
{
"epoch": 1.2,
"learning_rate": 1.409150749278567e-05,
"loss": 0.609,
"step": 723
},
{
"epoch": 1.21,
"learning_rate": 1.4040947166983147e-05,
"loss": 0.6169,
"step": 726
},
{
"epoch": 1.21,
"learning_rate": 1.3990263076985228e-05,
"loss": 0.6158,
"step": 729
},
{
"epoch": 1.22,
"learning_rate": 1.3939456775119962e-05,
"loss": 0.5725,
"step": 732
},
{
"epoch": 1.22,
"learning_rate": 1.3888529817458436e-05,
"loss": 0.6092,
"step": 735
},
{
"epoch": 1.23,
"learning_rate": 1.3837483763767128e-05,
"loss": 0.6091,
"step": 738
},
{
"epoch": 1.23,
"learning_rate": 1.3786320177460142e-05,
"loss": 0.629,
"step": 741
},
{
"epoch": 1.24,
"learning_rate": 1.3735040625551303e-05,
"loss": 0.5754,
"step": 744
},
{
"epoch": 1.24,
"learning_rate": 1.3683646678606176e-05,
"loss": 0.5961,
"step": 747
},
{
"epoch": 1.25,
"learning_rate": 1.3632139910693971e-05,
"loss": 0.5801,
"step": 750
},
{
"epoch": 1.25,
"learning_rate": 1.3580521899339316e-05,
"loss": 0.6093,
"step": 753
},
{
"epoch": 1.26,
"learning_rate": 1.3528794225473952e-05,
"loss": 0.5903,
"step": 756
},
{
"epoch": 1.26,
"learning_rate": 1.3476958473388315e-05,
"loss": 0.6145,
"step": 759
},
{
"epoch": 1.27,
"learning_rate": 1.3425016230683e-05,
"loss": 0.6015,
"step": 762
},
{
"epoch": 1.27,
"learning_rate": 1.3372969088220158e-05,
"loss": 0.5927,
"step": 765
},
{
"epoch": 1.28,
"learning_rate": 1.332081864007475e-05,
"loss": 0.6049,
"step": 768
},
{
"epoch": 1.28,
"learning_rate": 1.3268566483485734e-05,
"loss": 0.5824,
"step": 771
},
{
"epoch": 1.29,
"learning_rate": 1.3216214218807152e-05,
"loss": 0.593,
"step": 774
},
{
"epoch": 1.29,
"learning_rate": 1.3163763449459105e-05,
"loss": 0.5972,
"step": 777
},
{
"epoch": 1.3,
"learning_rate": 1.3111215781878648e-05,
"loss": 0.6,
"step": 780
},
{
"epoch": 1.3,
"learning_rate": 1.3058572825470588e-05,
"loss": 0.6063,
"step": 783
},
{
"epoch": 1.31,
"learning_rate": 1.3005836192558192e-05,
"loss": 0.6135,
"step": 786
},
{
"epoch": 1.31,
"learning_rate": 1.2953007498333807e-05,
"loss": 0.6082,
"step": 789
},
{
"epoch": 1.32,
"learning_rate": 1.2900088360809396e-05,
"loss": 0.5708,
"step": 792
},
{
"epoch": 1.32,
"learning_rate": 1.2847080400766962e-05,
"loss": 0.5901,
"step": 795
},
{
"epoch": 1.33,
"learning_rate": 1.2793985241708934e-05,
"loss": 0.612,
"step": 798
},
{
"epoch": 1.33,
"learning_rate": 1.274080450980843e-05,
"loss": 0.594,
"step": 801
},
{
"epoch": 1.34,
"learning_rate": 1.2687539833859444e-05,
"loss": 0.6105,
"step": 804
},
{
"epoch": 1.34,
"learning_rate": 1.2634192845226971e-05,
"loss": 0.6035,
"step": 807
},
{
"epoch": 1.35,
"learning_rate": 1.258076517779705e-05,
"loss": 0.5969,
"step": 810
},
{
"epoch": 1.35,
"learning_rate": 1.2527258467926688e-05,
"loss": 0.5916,
"step": 813
},
{
"epoch": 1.36,
"learning_rate": 1.2473674354393797e-05,
"loss": 0.6268,
"step": 816
},
{
"epoch": 1.36,
"learning_rate": 1.2420014478346944e-05,
"loss": 0.5937,
"step": 819
},
{
"epoch": 1.37,
"learning_rate": 1.2366280483255128e-05,
"loss": 0.5846,
"step": 822
},
{
"epoch": 1.37,
"learning_rate": 1.2312474014857432e-05,
"loss": 0.6201,
"step": 825
},
{
"epoch": 1.38,
"learning_rate": 1.2258596721112608e-05,
"loss": 0.5948,
"step": 828
},
{
"epoch": 1.38,
"learning_rate": 1.2204650252148615e-05,
"loss": 0.5824,
"step": 831
},
{
"epoch": 1.39,
"learning_rate": 1.2150636260212088e-05,
"loss": 0.5968,
"step": 834
},
{
"epoch": 1.39,
"learning_rate": 1.209655639961771e-05,
"loss": 0.6026,
"step": 837
},
{
"epoch": 1.4,
"learning_rate": 1.204241232669756e-05,
"loss": 0.6069,
"step": 840
},
{
"epoch": 1.4,
"learning_rate": 1.1988205699750391e-05,
"loss": 0.5881,
"step": 843
},
{
"epoch": 1.41,
"learning_rate": 1.1933938178990816e-05,
"loss": 0.5851,
"step": 846
},
{
"epoch": 1.41,
"learning_rate": 1.1879611426498488e-05,
"loss": 0.5966,
"step": 849
},
{
"epoch": 1.42,
"learning_rate": 1.1825227106167178e-05,
"loss": 0.5896,
"step": 852
},
{
"epoch": 1.42,
"learning_rate": 1.1770786883653806e-05,
"loss": 0.6103,
"step": 855
},
{
"epoch": 1.43,
"learning_rate": 1.1716292426327454e-05,
"loss": 0.6132,
"step": 858
},
{
"epoch": 1.43,
"learning_rate": 1.1661745403218276e-05,
"loss": 0.5875,
"step": 861
},
{
"epoch": 1.44,
"learning_rate": 1.1607147484966372e-05,
"loss": 0.5799,
"step": 864
},
{
"epoch": 1.44,
"learning_rate": 1.1552500343770658e-05,
"loss": 0.6214,
"step": 867
},
{
"epoch": 1.45,
"learning_rate": 1.149780565333761e-05,
"loss": 0.5999,
"step": 870
},
{
"epoch": 1.45,
"learning_rate": 1.1443065088830019e-05,
"loss": 0.5962,
"step": 873
},
{
"epoch": 1.46,
"learning_rate": 1.1388280326815697e-05,
"loss": 0.5969,
"step": 876
},
{
"epoch": 1.46,
"learning_rate": 1.1333453045216107e-05,
"loss": 0.5796,
"step": 879
},
{
"epoch": 1.47,
"learning_rate": 1.1278584923254981e-05,
"loss": 0.6056,
"step": 882
},
{
"epoch": 1.47,
"learning_rate": 1.1223677641406896e-05,
"loss": 0.5967,
"step": 885
},
{
"epoch": 1.48,
"learning_rate": 1.1168732881345794e-05,
"loss": 0.5935,
"step": 888
},
{
"epoch": 1.48,
"learning_rate": 1.1113752325893483e-05,
"loss": 0.602,
"step": 891
},
{
"epoch": 1.49,
"learning_rate": 1.1058737658968102e-05,
"loss": 0.5983,
"step": 894
},
{
"epoch": 1.49,
"learning_rate": 1.1003690565532522e-05,
"loss": 0.6014,
"step": 897
},
{
"epoch": 1.5,
"learning_rate": 1.0948612731542776e-05,
"loss": 0.5906,
"step": 900
},
{
"epoch": 1.5,
"learning_rate": 1.0893505843896403e-05,
"loss": 0.5803,
"step": 903
},
{
"epoch": 1.51,
"learning_rate": 1.0838371590380765e-05,
"loss": 0.5994,
"step": 906
},
{
"epoch": 1.51,
"learning_rate": 1.0783211659621396e-05,
"loss": 0.5852,
"step": 909
},
{
"epoch": 1.52,
"learning_rate": 1.0728027741030247e-05,
"loss": 0.6085,
"step": 912
},
{
"epoch": 1.52,
"learning_rate": 1.0672821524753953e-05,
"loss": 0.6101,
"step": 915
},
{
"epoch": 1.53,
"learning_rate": 1.061759470162209e-05,
"loss": 0.5844,
"step": 918
},
{
"epoch": 1.53,
"learning_rate": 1.056234896309535e-05,
"loss": 0.6055,
"step": 921
},
{
"epoch": 1.54,
"learning_rate": 1.050708600121377e-05,
"loss": 0.6062,
"step": 924
},
{
"epoch": 1.54,
"learning_rate": 1.0451807508544891e-05,
"loss": 0.5935,
"step": 927
},
{
"epoch": 1.55,
"learning_rate": 1.0396515178131925e-05,
"loss": 0.5854,
"step": 930
},
{
"epoch": 1.55,
"learning_rate": 1.0341210703441895e-05,
"loss": 0.5695,
"step": 933
},
{
"epoch": 1.56,
"learning_rate": 1.0285895778313783e-05,
"loss": 0.6004,
"step": 936
},
{
"epoch": 1.56,
"learning_rate": 1.0230572096906634e-05,
"loss": 0.5908,
"step": 939
},
{
"epoch": 1.57,
"learning_rate": 1.0175241353647678e-05,
"loss": 0.5852,
"step": 942
},
{
"epoch": 1.57,
"learning_rate": 1.0119905243180432e-05,
"loss": 0.5822,
"step": 945
},
{
"epoch": 1.58,
"learning_rate": 1.0064565460312794e-05,
"loss": 0.5818,
"step": 948
},
{
"epoch": 1.58,
"learning_rate": 1.000922369996515e-05,
"loss": 0.5944,
"step": 951
},
{
"epoch": 1.59,
"learning_rate": 9.953881657118438e-06,
"loss": 0.5785,
"step": 954
},
{
"epoch": 1.59,
"learning_rate": 9.898541026762252e-06,
"loss": 0.5894,
"step": 957
},
{
"epoch": 1.6,
"learning_rate": 9.843203503842928e-06,
"loss": 0.5947,
"step": 960
},
{
"epoch": 1.6,
"learning_rate": 9.787870783211633e-06,
"loss": 0.5969,
"step": 963
},
{
"epoch": 1.61,
"learning_rate": 9.732544559572443e-06,
"loss": 0.588,
"step": 966
},
{
"epoch": 1.61,
"learning_rate": 9.677226527430444e-06,
"loss": 0.5762,
"step": 969
},
{
"epoch": 1.62,
"learning_rate": 9.621918381039851e-06,
"loss": 0.5909,
"step": 972
},
{
"epoch": 1.62,
"learning_rate": 9.56662181435209e-06,
"loss": 0.5907,
"step": 975
},
{
"epoch": 1.63,
"learning_rate": 9.511338520963942e-06,
"loss": 0.5654,
"step": 978
},
{
"epoch": 1.63,
"learning_rate": 9.456070194065647e-06,
"loss": 0.5793,
"step": 981
},
{
"epoch": 1.64,
"learning_rate": 9.400818526389063e-06,
"loss": 0.5736,
"step": 984
},
{
"epoch": 1.64,
"learning_rate": 9.345585210155818e-06,
"loss": 0.5839,
"step": 987
},
{
"epoch": 1.65,
"learning_rate": 9.290371937025486e-06,
"loss": 0.5605,
"step": 990
},
{
"epoch": 1.65,
"learning_rate": 9.235180398043756e-06,
"loss": 0.5774,
"step": 993
},
{
"epoch": 1.66,
"learning_rate": 9.180012283590678e-06,
"loss": 0.5851,
"step": 996
},
{
"epoch": 1.66,
"learning_rate": 9.12486928332884e-06,
"loss": 0.5867,
"step": 999
},
{
"epoch": 1.66,
"eval_loss": 0.9312900900840759,
"eval_runtime": 61.0314,
"eval_samples_per_second": 7.586,
"eval_steps_per_second": 0.95,
"step": 1000
},
{
"epoch": 1.67,
"learning_rate": 9.069753086151657e-06,
"loss": 0.3263,
"step": 1002
},
{
"epoch": 1.67,
"learning_rate": 9.014665380131639e-06,
"loss": 0.3206,
"step": 1005
},
{
"epoch": 1.68,
"learning_rate": 8.959607852468667e-06,
"loss": 0.3393,
"step": 1008
},
{
"epoch": 1.68,
"learning_rate": 8.904582189438345e-06,
"loss": 0.3301,
"step": 1011
},
{
"epoch": 1.69,
"learning_rate": 8.849590076340344e-06,
"loss": 0.3327,
"step": 1014
},
{
"epoch": 1.69,
"learning_rate": 8.79463319744677e-06,
"loss": 0.3337,
"step": 1017
},
{
"epoch": 1.7,
"learning_rate": 8.739713235950608e-06,
"loss": 0.3504,
"step": 1020
},
{
"epoch": 1.7,
"learning_rate": 8.684831873914146e-06,
"loss": 0.3291,
"step": 1023
},
{
"epoch": 1.71,
"learning_rate": 8.629990792217464e-06,
"loss": 0.3301,
"step": 1026
},
{
"epoch": 1.71,
"learning_rate": 8.575191670506969e-06,
"loss": 0.3119,
"step": 1029
},
{
"epoch": 1.72,
"learning_rate": 8.520436187143921e-06,
"loss": 0.3326,
"step": 1032
},
{
"epoch": 1.72,
"learning_rate": 8.465726019153052e-06,
"loss": 0.3274,
"step": 1035
},
{
"epoch": 1.73,
"learning_rate": 8.411062842171198e-06,
"loss": 0.3294,
"step": 1038
},
{
"epoch": 1.73,
"learning_rate": 8.356448330395979e-06,
"loss": 0.3375,
"step": 1041
},
{
"epoch": 1.74,
"learning_rate": 8.301884156534511e-06,
"loss": 0.317,
"step": 1044
},
{
"epoch": 1.74,
"learning_rate": 8.247371991752201e-06,
"loss": 0.3359,
"step": 1047
},
{
"epoch": 1.75,
"learning_rate": 8.19291350562153e-06,
"loss": 0.3003,
"step": 1050
},
{
"epoch": 1.75,
"learning_rate": 8.138510366070937e-06,
"loss": 0.3194,
"step": 1053
},
{
"epoch": 1.76,
"learning_rate": 8.084164239333745e-06,
"loss": 0.3304,
"step": 1056
},
{
"epoch": 1.76,
"learning_rate": 8.029876789897102e-06,
"loss": 0.3219,
"step": 1059
},
{
"epoch": 1.77,
"learning_rate": 7.975649680451024e-06,
"loss": 0.3059,
"step": 1062
},
{
"epoch": 1.77,
"learning_rate": 7.921484571837465e-06,
"loss": 0.3342,
"step": 1065
},
{
"epoch": 1.78,
"learning_rate": 7.867383122999435e-06,
"loss": 0.3318,
"step": 1068
},
{
"epoch": 1.78,
"learning_rate": 7.813346990930209e-06,
"loss": 0.3303,
"step": 1071
},
{
"epoch": 1.79,
"learning_rate": 7.759377830622581e-06,
"loss": 0.3379,
"step": 1074
},
{
"epoch": 1.79,
"learning_rate": 7.705477295018148e-06,
"loss": 0.3267,
"step": 1077
},
{
"epoch": 1.8,
"learning_rate": 7.651647034956721e-06,
"loss": 0.3336,
"step": 1080
},
{
"epoch": 1.8,
"learning_rate": 7.597888699125731e-06,
"loss": 0.3346,
"step": 1083
},
{
"epoch": 1.81,
"learning_rate": 7.544203934009752e-06,
"loss": 0.3449,
"step": 1086
},
{
"epoch": 1.81,
"learning_rate": 7.4905943838400716e-06,
"loss": 0.3323,
"step": 1089
},
{
"epoch": 1.82,
"learning_rate": 7.437061690544331e-06,
"loss": 0.3114,
"step": 1092
},
{
"epoch": 1.82,
"learning_rate": 7.383607493696235e-06,
"loss": 0.3249,
"step": 1095
},
{
"epoch": 1.83,
"learning_rate": 7.330233430465334e-06,
"loss": 0.3248,
"step": 1098
},
{
"epoch": 1.83,
"learning_rate": 7.276941135566884e-06,
"loss": 0.3243,
"step": 1101
},
{
"epoch": 1.84,
"learning_rate": 7.223732241211783e-06,
"loss": 0.3255,
"step": 1104
},
{
"epoch": 1.84,
"learning_rate": 7.170608377056576e-06,
"loss": 0.3189,
"step": 1107
},
{
"epoch": 1.85,
"learning_rate": 7.117571170153543e-06,
"loss": 0.3145,
"step": 1110
},
{
"epoch": 1.85,
"learning_rate": 7.064622244900872e-06,
"loss": 0.3178,
"step": 1113
},
{
"epoch": 1.86,
"learning_rate": 7.011763222992891e-06,
"loss": 0.3007,
"step": 1116
},
{
"epoch": 1.86,
"learning_rate": 6.958995723370425e-06,
"loss": 0.3254,
"step": 1119
},
{
"epoch": 1.87,
"learning_rate": 6.906321362171189e-06,
"loss": 0.3263,
"step": 1122
},
{
"epoch": 1.87,
"learning_rate": 6.853741752680303e-06,
"loss": 0.3041,
"step": 1125
},
{
"epoch": 1.88,
"learning_rate": 6.801258505280876e-06,
"loss": 0.329,
"step": 1128
},
{
"epoch": 1.88,
"learning_rate": 6.74887322740469e-06,
"loss": 0.3324,
"step": 1131
},
{
"epoch": 1.89,
"learning_rate": 6.696587523482951e-06,
"loss": 0.3213,
"step": 1134
},
{
"epoch": 1.89,
"learning_rate": 6.644402994897171e-06,
"loss": 0.3221,
"step": 1137
},
{
"epoch": 1.9,
"learning_rate": 6.592321239930112e-06,
"loss": 0.3059,
"step": 1140
},
{
"epoch": 1.9,
"learning_rate": 6.540343853716834e-06,
"loss": 0.3248,
"step": 1143
},
{
"epoch": 1.91,
"learning_rate": 6.488472428195841e-06,
"loss": 0.3325,
"step": 1146
},
{
"epoch": 1.91,
"learning_rate": 6.436708552060317e-06,
"loss": 0.3132,
"step": 1149
},
{
"epoch": 1.92,
"learning_rate": 6.385053810709485e-06,
"loss": 0.3205,
"step": 1152
},
{
"epoch": 1.92,
"learning_rate": 6.333509786200032e-06,
"loss": 0.3095,
"step": 1155
},
{
"epoch": 1.93,
"learning_rate": 6.282078057197671e-06,
"loss": 0.3166,
"step": 1158
},
{
"epoch": 1.93,
"learning_rate": 6.230760198928774e-06,
"loss": 0.3172,
"step": 1161
},
{
"epoch": 1.94,
"learning_rate": 6.1795577831321405e-06,
"loss": 0.3251,
"step": 1164
},
{
"epoch": 1.94,
"learning_rate": 6.128472378010846e-06,
"loss": 0.3211,
"step": 1167
},
{
"epoch": 1.95,
"learning_rate": 6.077505548184226e-06,
"loss": 0.3091,
"step": 1170
},
{
"epoch": 1.95,
"learning_rate": 6.026658854639946e-06,
"loss": 0.3341,
"step": 1173
},
{
"epoch": 1.96,
"learning_rate": 5.975933854686196e-06,
"loss": 0.3016,
"step": 1176
},
{
"epoch": 1.96,
"learning_rate": 5.925332101903994e-06,
"loss": 0.333,
"step": 1179
},
{
"epoch": 1.97,
"learning_rate": 5.874855146099594e-06,
"loss": 0.3234,
"step": 1182
},
{
"epoch": 1.97,
"learning_rate": 5.824504533257042e-06,
"loss": 0.3256,
"step": 1185
},
{
"epoch": 1.98,
"learning_rate": 5.774281805490798e-06,
"loss": 0.3102,
"step": 1188
},
{
"epoch": 1.98,
"learning_rate": 5.724188500998531e-06,
"loss": 0.315,
"step": 1191
},
{
"epoch": 1.99,
"learning_rate": 5.674226154013988e-06,
"loss": 0.3095,
"step": 1194
},
{
"epoch": 1.99,
"learning_rate": 5.624396294760022e-06,
"loss": 0.3094,
"step": 1197
},
{
"epoch": 2.0,
"learning_rate": 5.574700449401702e-06,
"loss": 0.3182,
"step": 1200
},
{
"epoch": 2.0,
"learning_rate": 5.525140139999588e-06,
"loss": 0.2879,
"step": 1203
},
{
"epoch": 2.01,
"learning_rate": 5.475716884463113e-06,
"loss": 0.2313,
"step": 1206
},
{
"epoch": 2.01,
"learning_rate": 5.42643219650409e-06,
"loss": 0.2257,
"step": 1209
},
{
"epoch": 2.02,
"learning_rate": 5.377287585590352e-06,
"loss": 0.2328,
"step": 1212
},
{
"epoch": 2.02,
"learning_rate": 5.328284556899513e-06,
"loss": 0.239,
"step": 1215
},
{
"epoch": 2.03,
"learning_rate": 5.279424611272873e-06,
"loss": 0.2301,
"step": 1218
},
{
"epoch": 2.03,
"learning_rate": 5.230709245169461e-06,
"loss": 0.2333,
"step": 1221
},
{
"epoch": 2.04,
"learning_rate": 5.182139950620188e-06,
"loss": 0.2308,
"step": 1224
},
{
"epoch": 2.04,
"learning_rate": 5.133718215182162e-06,
"loss": 0.2562,
"step": 1227
},
{
"epoch": 2.05,
"learning_rate": 5.085445521893113e-06,
"loss": 0.2491,
"step": 1230
},
{
"epoch": 2.05,
"learning_rate": 5.037323349225984e-06,
"loss": 0.2534,
"step": 1233
},
{
"epoch": 2.06,
"learning_rate": 4.989353171043637e-06,
"loss": 0.2193,
"step": 1236
},
{
"epoch": 2.06,
"learning_rate": 4.94153645655373e-06,
"loss": 0.2435,
"step": 1239
},
{
"epoch": 2.07,
"learning_rate": 4.893874670263704e-06,
"loss": 0.2351,
"step": 1242
},
{
"epoch": 2.07,
"learning_rate": 4.846369271935932e-06,
"loss": 0.2399,
"step": 1245
},
{
"epoch": 2.08,
"learning_rate": 4.79902171654302e-06,
"loss": 0.2506,
"step": 1248
},
{
"epoch": 2.08,
"learning_rate": 4.7518334542232215e-06,
"loss": 0.239,
"step": 1251
},
{
"epoch": 2.09,
"learning_rate": 4.704805930236044e-06,
"loss": 0.2435,
"step": 1254
},
{
"epoch": 2.09,
"learning_rate": 4.657940584917983e-06,
"loss": 0.2505,
"step": 1257
},
{
"epoch": 2.1,
"learning_rate": 4.611238853638399e-06,
"loss": 0.2282,
"step": 1260
},
{
"epoch": 2.1,
"learning_rate": 4.564702166755565e-06,
"loss": 0.2302,
"step": 1263
},
{
"epoch": 2.11,
"learning_rate": 4.518331949572847e-06,
"loss": 0.2418,
"step": 1266
},
{
"epoch": 2.11,
"learning_rate": 4.472129622295055e-06,
"loss": 0.2361,
"step": 1269
},
{
"epoch": 2.12,
"learning_rate": 4.426096599984954e-06,
"loss": 0.2533,
"step": 1272
},
{
"epoch": 2.12,
"learning_rate": 4.380234292519917e-06,
"loss": 0.2389,
"step": 1275
},
{
"epoch": 2.13,
"learning_rate": 4.33454410454874e-06,
"loss": 0.2415,
"step": 1278
},
{
"epoch": 2.13,
"learning_rate": 4.289027435448633e-06,
"loss": 0.236,
"step": 1281
},
{
"epoch": 2.14,
"learning_rate": 4.243685679282343e-06,
"loss": 0.2363,
"step": 1284
},
{
"epoch": 2.14,
"learning_rate": 4.19852022475547e-06,
"loss": 0.2401,
"step": 1287
},
{
"epoch": 2.15,
"learning_rate": 4.153532455173939e-06,
"loss": 0.2428,
"step": 1290
},
{
"epoch": 2.15,
"learning_rate": 4.1087237484016185e-06,
"loss": 0.242,
"step": 1293
},
{
"epoch": 2.16,
"learning_rate": 4.064095476818133e-06,
"loss": 0.2352,
"step": 1296
},
{
"epoch": 2.16,
"learning_rate": 4.019649007276815e-06,
"loss": 0.2538,
"step": 1299
},
{
"epoch": 2.17,
"learning_rate": 3.9753857010628615e-06,
"loss": 0.2391,
"step": 1302
},
{
"epoch": 2.17,
"learning_rate": 3.931306913851621e-06,
"loss": 0.236,
"step": 1305
},
{
"epoch": 2.18,
"learning_rate": 3.887413995667091e-06,
"loss": 0.2344,
"step": 1308
},
{
"epoch": 2.18,
"learning_rate": 3.843708290840557e-06,
"loss": 0.2529,
"step": 1311
},
{
"epoch": 2.19,
"learning_rate": 3.8001911379694267e-06,
"loss": 0.2401,
"step": 1314
},
{
"epoch": 2.19,
"learning_rate": 3.756863869876224e-06,
"loss": 0.2384,
"step": 1317
},
{
"epoch": 2.2,
"learning_rate": 3.7137278135677714e-06,
"loss": 0.2271,
"step": 1320
},
{
"epoch": 2.2,
"learning_rate": 3.6707842901945546e-06,
"loss": 0.2273,
"step": 1323
},
{
"epoch": 2.21,
"learning_rate": 3.628034615010251e-06,
"loss": 0.2186,
"step": 1326
},
{
"epoch": 2.21,
"learning_rate": 3.58548009733145e-06,
"loss": 0.2526,
"step": 1329
},
{
"epoch": 2.22,
"learning_rate": 3.543122040497542e-06,
"loss": 0.2402,
"step": 1332
},
{
"epoch": 2.22,
"learning_rate": 3.5009617418308208e-06,
"loss": 0.2327,
"step": 1335
},
{
"epoch": 2.23,
"learning_rate": 3.4590004925967245e-06,
"loss": 0.2333,
"step": 1338
},
{
"epoch": 2.23,
"learning_rate": 3.417239577964312e-06,
"loss": 0.2345,
"step": 1341
},
{
"epoch": 2.24,
"learning_rate": 3.375680276966884e-06,
"loss": 0.2385,
"step": 1344
},
{
"epoch": 2.24,
"learning_rate": 3.3343238624628204e-06,
"loss": 0.2247,
"step": 1347
},
{
"epoch": 2.25,
"learning_rate": 3.2931716010965807e-06,
"loss": 0.2285,
"step": 1350
},
{
"epoch": 2.25,
"learning_rate": 3.2522247532599327e-06,
"loss": 0.251,
"step": 1353
},
{
"epoch": 2.26,
"learning_rate": 3.211484573053324e-06,
"loss": 0.2324,
"step": 1356
},
{
"epoch": 2.26,
"learning_rate": 3.1709523082474937e-06,
"loss": 0.2279,
"step": 1359
},
{
"epoch": 2.27,
"learning_rate": 3.130629200245249e-06,
"loss": 0.2438,
"step": 1362
},
{
"epoch": 2.27,
"learning_rate": 3.090516484043432e-06,
"loss": 0.2315,
"step": 1365
},
{
"epoch": 2.28,
"learning_rate": 3.0506153881951183e-06,
"loss": 0.2468,
"step": 1368
},
{
"epoch": 2.28,
"learning_rate": 3.010927134771966e-06,
"loss": 0.2221,
"step": 1371
},
{
"epoch": 2.29,
"learning_rate": 2.971452939326802e-06,
"loss": 0.2515,
"step": 1374
},
{
"epoch": 2.29,
"learning_rate": 2.9321940108563907e-06,
"loss": 0.2159,
"step": 1377
},
{
"epoch": 2.3,
"learning_rate": 2.8931515517644017e-06,
"loss": 0.2418,
"step": 1380
},
{
"epoch": 2.3,
"learning_rate": 2.854326757824576e-06,
"loss": 0.2311,
"step": 1383
},
{
"epoch": 2.31,
"learning_rate": 2.815720818144123e-06,
"loss": 0.2677,
"step": 1386
},
{
"epoch": 2.31,
"learning_rate": 2.7773349151272745e-06,
"loss": 0.2474,
"step": 1389
},
{
"epoch": 2.32,
"learning_rate": 2.7391702244390984e-06,
"loss": 0.2315,
"step": 1392
},
{
"epoch": 2.32,
"learning_rate": 2.7012279149694685e-06,
"loss": 0.2319,
"step": 1395
},
{
"epoch": 2.33,
"learning_rate": 2.663509148797281e-06,
"loss": 0.2272,
"step": 1398
},
{
"epoch": 2.33,
"learning_rate": 2.6260150811548434e-06,
"loss": 0.2451,
"step": 1401
},
{
"epoch": 2.34,
"learning_rate": 2.588746860392516e-06,
"loss": 0.2426,
"step": 1404
},
{
"epoch": 2.34,
"learning_rate": 2.5517056279435183e-06,
"loss": 0.238,
"step": 1407
},
{
"epoch": 2.35,
"learning_rate": 2.514892518288988e-06,
"loss": 0.2351,
"step": 1410
},
{
"epoch": 2.35,
"learning_rate": 2.47830865892323e-06,
"loss": 0.2278,
"step": 1413
},
{
"epoch": 2.36,
"learning_rate": 2.441955170319168e-06,
"loss": 0.2369,
"step": 1416
},
{
"epoch": 2.36,
"learning_rate": 2.405833165894055e-06,
"loss": 0.2356,
"step": 1419
},
{
"epoch": 2.37,
"learning_rate": 2.3699437519753444e-06,
"loss": 0.2281,
"step": 1422
},
{
"epoch": 2.37,
"learning_rate": 2.334288027766828e-06,
"loss": 0.2253,
"step": 1425
},
{
"epoch": 2.38,
"learning_rate": 2.2988670853149574e-06,
"loss": 0.2432,
"step": 1428
},
{
"epoch": 2.38,
"learning_rate": 2.2636820094754077e-06,
"loss": 0.2365,
"step": 1431
},
{
"epoch": 2.39,
"learning_rate": 2.2287338778798283e-06,
"loss": 0.2324,
"step": 1434
},
{
"epoch": 2.39,
"learning_rate": 2.1940237609028735e-06,
"loss": 0.2435,
"step": 1437
},
{
"epoch": 2.4,
"learning_rate": 2.159552721629381e-06,
"loss": 0.2275,
"step": 1440
},
{
"epoch": 2.4,
"learning_rate": 2.125321815821846e-06,
"loss": 0.2411,
"step": 1443
},
{
"epoch": 2.41,
"learning_rate": 2.0913320918880643e-06,
"loss": 0.2582,
"step": 1446
},
{
"epoch": 2.41,
"learning_rate": 2.0575845908490267e-06,
"loss": 0.2433,
"step": 1449
},
{
"epoch": 2.42,
"learning_rate": 2.0240803463070425e-06,
"loss": 0.2225,
"step": 1452
},
{
"epoch": 2.42,
"learning_rate": 1.990820384414075e-06,
"loss": 0.2155,
"step": 1455
},
{
"epoch": 2.43,
"learning_rate": 1.9578057238403124e-06,
"loss": 0.2397,
"step": 1458
},
{
"epoch": 2.43,
"learning_rate": 1.9250373757429753e-06,
"loss": 0.2453,
"step": 1461
},
{
"epoch": 2.44,
"learning_rate": 1.8925163437353465e-06,
"loss": 0.2407,
"step": 1464
},
{
"epoch": 2.44,
"learning_rate": 1.8602436238560218e-06,
"loss": 0.2512,
"step": 1467
},
{
"epoch": 2.45,
"learning_rate": 1.828220204538419e-06,
"loss": 0.2458,
"step": 1470
},
{
"epoch": 2.45,
"learning_rate": 1.7964470665804923e-06,
"loss": 0.2314,
"step": 1473
},
{
"epoch": 2.46,
"learning_rate": 1.7649251831147018e-06,
"loss": 0.2339,
"step": 1476
},
{
"epoch": 2.46,
"learning_rate": 1.733655519578209e-06,
"loss": 0.2383,
"step": 1479
},
{
"epoch": 2.47,
"learning_rate": 1.7026390336832933e-06,
"loss": 0.2307,
"step": 1482
},
{
"epoch": 2.47,
"learning_rate": 1.671876675388039e-06,
"loss": 0.2364,
"step": 1485
},
{
"epoch": 2.48,
"learning_rate": 1.6413693868672332e-06,
"loss": 0.2278,
"step": 1488
},
{
"epoch": 2.48,
"learning_rate": 1.6111181024834999e-06,
"loss": 0.2282,
"step": 1491
},
{
"epoch": 2.49,
"learning_rate": 1.5811237487586973e-06,
"loss": 0.2436,
"step": 1494
},
{
"epoch": 2.49,
"learning_rate": 1.5513872443455358e-06,
"loss": 0.2206,
"step": 1497
},
{
"epoch": 2.5,
"learning_rate": 1.5219094999994322e-06,
"loss": 0.2405,
"step": 1500
},
{
"epoch": 2.5,
"eval_loss": 0.9922696948051453,
"eval_runtime": 61.3926,
"eval_samples_per_second": 7.542,
"eval_steps_per_second": 0.945,
"step": 1500
},
{
"epoch": 2.5,
"learning_rate": 1.4926914185506315e-06,
"loss": 0.2409,
"step": 1503
},
{
"epoch": 2.51,
"learning_rate": 1.4637338948765467e-06,
"loss": 0.2313,
"step": 1506
},
{
"epoch": 2.51,
"learning_rate": 1.4350378158743484e-06,
"loss": 0.2355,
"step": 1509
},
{
"epoch": 2.52,
"learning_rate": 1.4066040604338104e-06,
"loss": 0.2309,
"step": 1512
},
{
"epoch": 2.52,
"learning_rate": 1.3784334994103787e-06,
"loss": 0.2225,
"step": 1515
},
{
"epoch": 2.53,
"learning_rate": 1.3505269955985122e-06,
"loss": 0.2376,
"step": 1518
},
{
"epoch": 2.53,
"learning_rate": 1.322885403705254e-06,
"loss": 0.2168,
"step": 1521
},
{
"epoch": 2.54,
"learning_rate": 1.2955095703240438e-06,
"loss": 0.2493,
"step": 1524
},
{
"epoch": 2.54,
"learning_rate": 1.2684003339088024e-06,
"loss": 0.2473,
"step": 1527
},
{
"epoch": 2.55,
"learning_rate": 1.2415585247482498e-06,
"loss": 0.2251,
"step": 1530
},
{
"epoch": 2.55,
"learning_rate": 1.21498496494046e-06,
"loss": 0.2266,
"step": 1533
},
{
"epoch": 2.56,
"learning_rate": 1.1886804683677068e-06,
"loss": 0.2196,
"step": 1536
},
{
"epoch": 2.56,
"learning_rate": 1.1626458406715192e-06,
"loss": 0.2298,
"step": 1539
},
{
"epoch": 2.57,
"learning_rate": 1.1368818792280089e-06,
"loss": 0.2332,
"step": 1542
},
{
"epoch": 2.57,
"learning_rate": 1.1113893731234537e-06,
"loss": 0.2366,
"step": 1545
},
{
"epoch": 2.58,
"learning_rate": 1.0861691031301313e-06,
"loss": 0.2327,
"step": 1548
},
{
"epoch": 2.58,
"learning_rate": 1.0612218416823926e-06,
"loss": 0.2244,
"step": 1551
},
{
"epoch": 2.59,
"learning_rate": 1.0365483528530206e-06,
"loss": 0.2349,
"step": 1554
},
{
"epoch": 2.59,
"learning_rate": 1.012149392329822e-06,
"loss": 0.248,
"step": 1557
},
{
"epoch": 2.6,
"learning_rate": 9.88025707392475e-07,
"loss": 0.226,
"step": 1560
},
{
"epoch": 2.6,
"learning_rate": 9.641780368896592e-07,
"loss": 0.2437,
"step": 1563
},
{
"epoch": 2.6,
"learning_rate": 9.406071112164061e-07,
"loss": 0.2188,
"step": 1566
},
{
"epoch": 2.61,
"learning_rate": 9.173136522917458e-07,
"loss": 0.2368,
"step": 1569
},
{
"epoch": 2.61,
"learning_rate": 8.942983735365885e-07,
"loss": 0.2409,
"step": 1572
},
{
"epoch": 2.62,
"learning_rate": 8.715619798518715e-07,
"loss": 0.2313,
"step": 1575
},
{
"epoch": 2.62,
"learning_rate": 8.491051675969774e-07,
"loss": 0.2467,
"step": 1578
},
{
"epoch": 2.63,
"learning_rate": 8.269286245684038e-07,
"loss": 0.2276,
"step": 1581
},
{
"epoch": 2.63,
"learning_rate": 8.050330299786913e-07,
"loss": 0.2377,
"step": 1584
},
{
"epoch": 2.64,
"learning_rate": 7.834190544356312e-07,
"loss": 0.2311,
"step": 1587
},
{
"epoch": 2.64,
"learning_rate": 7.620873599217215e-07,
"loss": 0.2366,
"step": 1590
},
{
"epoch": 2.65,
"learning_rate": 7.410385997738867e-07,
"loss": 0.2264,
"step": 1593
},
{
"epoch": 2.65,
"learning_rate": 7.202734186634785e-07,
"loss": 0.2235,
"step": 1596
},
{
"epoch": 2.66,
"learning_rate": 6.997924525765209e-07,
"loss": 0.2264,
"step": 1599
},
{
"epoch": 2.66,
"learning_rate": 6.79596328794242e-07,
"loss": 0.2267,
"step": 1602
},
{
"epoch": 2.67,
"learning_rate": 6.596856658738504e-07,
"loss": 0.223,
"step": 1605
},
{
"epoch": 2.67,
"learning_rate": 6.40061073629602e-07,
"loss": 0.2284,
"step": 1608
},
{
"epoch": 2.68,
"learning_rate": 6.20723153114109e-07,
"loss": 0.2485,
"step": 1611
},
{
"epoch": 2.68,
"learning_rate": 6.016724965999487e-07,
"loss": 0.2402,
"step": 1614
},
{
"epoch": 2.69,
"learning_rate": 5.829096875615048e-07,
"loss": 0.2168,
"step": 1617
},
{
"epoch": 2.69,
"learning_rate": 5.644353006571124e-07,
"loss": 0.2476,
"step": 1620
},
{
"epoch": 2.7,
"learning_rate": 5.462499017114508e-07,
"loss": 0.2362,
"step": 1623
},
{
"epoch": 2.7,
"learning_rate": 5.283540476982108e-07,
"loss": 0.2446,
"step": 1626
},
{
"epoch": 2.71,
"learning_rate": 5.107482867230462e-07,
"loss": 0.2244,
"step": 1629
},
{
"epoch": 2.71,
"learning_rate": 4.934331580067719e-07,
"loss": 0.2285,
"step": 1632
},
{
"epoch": 2.72,
"learning_rate": 4.7640919186886294e-07,
"loss": 0.2382,
"step": 1635
},
{
"epoch": 2.72,
"learning_rate": 4.596769097112042e-07,
"loss": 0.2369,
"step": 1638
},
{
"epoch": 2.73,
"learning_rate": 4.4323682400212675e-07,
"loss": 0.2276,
"step": 1641
},
{
"epoch": 2.73,
"learning_rate": 4.2708943826070157e-07,
"loss": 0.2217,
"step": 1644
},
{
"epoch": 2.74,
"learning_rate": 4.1123524704133276e-07,
"loss": 0.2245,
"step": 1647
},
{
"epoch": 2.74,
"learning_rate": 3.956747359185964e-07,
"loss": 0.251,
"step": 1650
},
{
"epoch": 2.75,
"learning_rate": 3.804083814723791e-07,
"loss": 0.2263,
"step": 1653
},
{
"epoch": 2.75,
"learning_rate": 3.6543665127327765e-07,
"loss": 0.2521,
"step": 1656
},
{
"epoch": 2.76,
"learning_rate": 3.5076000386827903e-07,
"loss": 0.2253,
"step": 1659
},
{
"epoch": 2.76,
"learning_rate": 3.3637888876671076e-07,
"loss": 0.2405,
"step": 1662
},
{
"epoch": 2.77,
"learning_rate": 3.2229374642648193e-07,
"loss": 0.2062,
"step": 1665
},
{
"epoch": 2.77,
"learning_rate": 3.085050082405916e-07,
"loss": 0.2409,
"step": 1668
},
{
"epoch": 2.78,
"learning_rate": 2.9501309652391063e-07,
"loss": 0.2245,
"step": 1671
},
{
"epoch": 2.78,
"learning_rate": 2.818184245002542e-07,
"loss": 0.2338,
"step": 1674
},
{
"epoch": 2.79,
"learning_rate": 2.689213962897186e-07,
"loss": 0.2306,
"step": 1677
},
{
"epoch": 2.79,
"learning_rate": 2.5632240689631216e-07,
"loss": 0.2255,
"step": 1680
},
{
"epoch": 2.8,
"learning_rate": 2.4402184219584735e-07,
"loss": 0.224,
"step": 1683
},
{
"epoch": 2.8,
"learning_rate": 2.3202007892413447e-07,
"loss": 0.2341,
"step": 1686
},
{
"epoch": 2.81,
"learning_rate": 2.2031748466543214e-07,
"loss": 0.244,
"step": 1689
},
{
"epoch": 2.81,
"learning_rate": 2.0891441784119725e-07,
"loss": 0.2269,
"step": 1692
},
{
"epoch": 2.82,
"learning_rate": 1.9781122769910065e-07,
"loss": 0.2213,
"step": 1695
},
{
"epoch": 2.82,
"learning_rate": 1.8700825430233216e-07,
"loss": 0.2259,
"step": 1698
},
{
"epoch": 2.83,
"learning_rate": 1.765058285191923e-07,
"loss": 0.2355,
"step": 1701
},
{
"epoch": 2.83,
"learning_rate": 1.663042720129504e-07,
"loss": 0.2335,
"step": 1704
},
{
"epoch": 2.84,
"learning_rate": 1.5640389723199478e-07,
"loss": 0.252,
"step": 1707
},
{
"epoch": 2.84,
"learning_rate": 1.468050074002658e-07,
"loss": 0.223,
"step": 1710
},
{
"epoch": 2.85,
"learning_rate": 1.375078965079657e-07,
"loss": 0.2249,
"step": 1713
},
{
"epoch": 2.85,
"learning_rate": 1.2851284930255447e-07,
"loss": 0.249,
"step": 1716
},
{
"epoch": 2.86,
"learning_rate": 1.1982014128003373e-07,
"loss": 0.2375,
"step": 1719
},
{
"epoch": 2.86,
"learning_rate": 1.1143003867650437e-07,
"loss": 0.2182,
"step": 1722
},
{
"epoch": 2.87,
"learning_rate": 1.0334279846001106e-07,
"loss": 0.2249,
"step": 1725
},
{
"epoch": 2.87,
"learning_rate": 9.555866832267502e-08,
"loss": 0.2259,
"step": 1728
},
{
"epoch": 2.88,
"learning_rate": 8.807788667311023e-08,
"loss": 0.2455,
"step": 1731
},
{
"epoch": 2.88,
"learning_rate": 8.090068262911366e-08,
"loss": 0.2314,
"step": 1734
},
{
"epoch": 2.89,
"learning_rate": 7.402727601065529e-08,
"loss": 0.2351,
"step": 1737
},
{
"epoch": 2.89,
"learning_rate": 6.745787733314469e-08,
"loss": 0.2203,
"step": 1740
},
{
"epoch": 2.9,
"learning_rate": 6.11926878009772e-08,
"loss": 0.2377,
"step": 1743
},
{
"epoch": 2.9,
"learning_rate": 5.5231899301380065e-08,
"loss": 0.23,
"step": 1746
},
{
"epoch": 2.91,
"learning_rate": 4.957569439853038e-08,
"loss": 0.2136,
"step": 1749
},
{
"epoch": 2.91,
"learning_rate": 4.422424632796185e-08,
"loss": 0.2231,
"step": 1752
},
{
"epoch": 2.92,
"learning_rate": 3.917771899126677e-08,
"loss": 0.2499,
"step": 1755
},
{
"epoch": 2.92,
"learning_rate": 3.4436266951067834e-08,
"loss": 0.2348,
"step": 1758
},
{
"epoch": 2.93,
"learning_rate": 3.000003542628749e-08,
"loss": 0.2341,
"step": 1761
},
{
"epoch": 2.93,
"learning_rate": 2.586916028770259e-08,
"loss": 0.2424,
"step": 1764
},
{
"epoch": 2.94,
"learning_rate": 2.2043768053777726e-08,
"loss": 0.2495,
"step": 1767
},
{
"epoch": 2.94,
"learning_rate": 1.852397588679722e-08,
"loss": 0.2498,
"step": 1770
},
{
"epoch": 2.95,
"learning_rate": 1.5309891589267988e-08,
"loss": 0.2203,
"step": 1773
},
{
"epoch": 2.95,
"learning_rate": 1.2401613600627749e-08,
"loss": 0.2216,
"step": 1776
},
{
"epoch": 2.96,
"learning_rate": 9.799230994220754e-09,
"loss": 0.2279,
"step": 1779
},
{
"epoch": 2.96,
"learning_rate": 7.502823474576648e-09,
"loss": 0.2379,
"step": 1782
},
{
"epoch": 2.97,
"learning_rate": 5.512461374964639e-09,
"loss": 0.2306,
"step": 1785
},
{
"epoch": 2.97,
"learning_rate": 3.828205655244111e-09,
"loss": 0.2546,
"step": 1788
},
{
"epoch": 2.98,
"learning_rate": 2.4501078999916716e-09,
"loss": 0.2396,
"step": 1791
},
{
"epoch": 2.98,
"learning_rate": 1.3782103169246441e-09,
"loss": 0.2226,
"step": 1794
},
{
"epoch": 2.99,
"learning_rate": 6.125457356120912e-10,
"loss": 0.2357,
"step": 1797
},
{
"epoch": 2.99,
"learning_rate": 1.5313760645896403e-10,
"loss": 0.2264,
"step": 1800
},
{
"epoch": 3.0,
"learning_rate": 0.0,
"loss": 0.236,
"step": 1803
},
{
"epoch": 3.0,
"step": 1803,
"total_flos": 1.9808467849371766e+20,
"train_loss": 0.11453570330466949,
"train_runtime": 197727.0226,
"train_samples_per_second": 7.005,
"train_steps_per_second": 0.009
}
],
"logging_steps": 3,
"max_steps": 1803,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.9808467849371766e+20,
"train_batch_size": 96,
"trial_name": null,
"trial_params": null
}