tinyllama-1.1b-sum-sft-qlora / trainer_state.json
martimfasantos's picture
Model save
80007e5 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9996613039796782,
"eval_steps": 500,
"global_step": 5904,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.671875,
"learning_rate": 3.38409475465313e-07,
"loss": 2.4832,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.546875,
"learning_rate": 1.6920473773265652e-06,
"loss": 2.5282,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 0.71484375,
"learning_rate": 3.3840947546531303e-06,
"loss": 2.5149,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.8671875,
"learning_rate": 5.076142131979695e-06,
"loss": 2.5184,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 0.88671875,
"learning_rate": 6.768189509306261e-06,
"loss": 2.5026,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 0.7734375,
"learning_rate": 8.460236886632826e-06,
"loss": 2.5038,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 0.59375,
"learning_rate": 1.015228426395939e-05,
"loss": 2.4692,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 0.5546875,
"learning_rate": 1.1844331641285957e-05,
"loss": 2.4495,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 0.47265625,
"learning_rate": 1.3536379018612521e-05,
"loss": 2.4364,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 0.34765625,
"learning_rate": 1.5228426395939088e-05,
"loss": 2.4702,
"step": 45
},
{
"epoch": 0.02,
"grad_norm": 0.259765625,
"learning_rate": 1.6920473773265652e-05,
"loss": 2.4237,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 0.2265625,
"learning_rate": 1.8612521150592218e-05,
"loss": 2.402,
"step": 55
},
{
"epoch": 0.02,
"grad_norm": 0.19140625,
"learning_rate": 2.030456852791878e-05,
"loss": 2.3978,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 0.1708984375,
"learning_rate": 2.199661590524535e-05,
"loss": 2.3895,
"step": 65
},
{
"epoch": 0.02,
"grad_norm": 0.1767578125,
"learning_rate": 2.3688663282571914e-05,
"loss": 2.4104,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 0.1552734375,
"learning_rate": 2.5380710659898476e-05,
"loss": 2.3876,
"step": 75
},
{
"epoch": 0.03,
"grad_norm": 0.162109375,
"learning_rate": 2.7072758037225043e-05,
"loss": 2.4002,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 0.1650390625,
"learning_rate": 2.876480541455161e-05,
"loss": 2.3434,
"step": 85
},
{
"epoch": 0.03,
"grad_norm": 0.1796875,
"learning_rate": 3.0456852791878175e-05,
"loss": 2.3395,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 0.16015625,
"learning_rate": 3.214890016920474e-05,
"loss": 2.3176,
"step": 95
},
{
"epoch": 0.03,
"grad_norm": 0.1513671875,
"learning_rate": 3.3840947546531304e-05,
"loss": 2.3123,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 0.1552734375,
"learning_rate": 3.553299492385787e-05,
"loss": 2.3051,
"step": 105
},
{
"epoch": 0.04,
"grad_norm": 0.138671875,
"learning_rate": 3.7225042301184437e-05,
"loss": 2.3203,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 0.1416015625,
"learning_rate": 3.8917089678510996e-05,
"loss": 2.2931,
"step": 115
},
{
"epoch": 0.04,
"grad_norm": 0.1435546875,
"learning_rate": 4.060913705583756e-05,
"loss": 2.307,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 0.138671875,
"learning_rate": 4.230118443316413e-05,
"loss": 2.2731,
"step": 125
},
{
"epoch": 0.04,
"grad_norm": 0.134765625,
"learning_rate": 4.39932318104907e-05,
"loss": 2.2846,
"step": 130
},
{
"epoch": 0.05,
"grad_norm": 0.134765625,
"learning_rate": 4.568527918781726e-05,
"loss": 2.256,
"step": 135
},
{
"epoch": 0.05,
"grad_norm": 0.1328125,
"learning_rate": 4.737732656514383e-05,
"loss": 2.2524,
"step": 140
},
{
"epoch": 0.05,
"grad_norm": 0.1298828125,
"learning_rate": 4.906937394247039e-05,
"loss": 2.2717,
"step": 145
},
{
"epoch": 0.05,
"grad_norm": 0.1318359375,
"learning_rate": 5.076142131979695e-05,
"loss": 2.2681,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 0.1279296875,
"learning_rate": 5.245346869712352e-05,
"loss": 2.2574,
"step": 155
},
{
"epoch": 0.05,
"grad_norm": 0.130859375,
"learning_rate": 5.4145516074450085e-05,
"loss": 2.2448,
"step": 160
},
{
"epoch": 0.06,
"grad_norm": 0.1318359375,
"learning_rate": 5.583756345177665e-05,
"loss": 2.2111,
"step": 165
},
{
"epoch": 0.06,
"grad_norm": 0.1396484375,
"learning_rate": 5.752961082910322e-05,
"loss": 2.2193,
"step": 170
},
{
"epoch": 0.06,
"grad_norm": 0.1376953125,
"learning_rate": 5.9221658206429784e-05,
"loss": 2.2512,
"step": 175
},
{
"epoch": 0.06,
"grad_norm": 0.1357421875,
"learning_rate": 6.091370558375635e-05,
"loss": 2.2315,
"step": 180
},
{
"epoch": 0.06,
"grad_norm": 0.142578125,
"learning_rate": 6.26057529610829e-05,
"loss": 2.2352,
"step": 185
},
{
"epoch": 0.06,
"grad_norm": 0.140625,
"learning_rate": 6.429780033840948e-05,
"loss": 2.2084,
"step": 190
},
{
"epoch": 0.07,
"grad_norm": 0.140625,
"learning_rate": 6.598984771573604e-05,
"loss": 2.2269,
"step": 195
},
{
"epoch": 0.07,
"grad_norm": 0.1416015625,
"learning_rate": 6.768189509306261e-05,
"loss": 2.2031,
"step": 200
},
{
"epoch": 0.07,
"grad_norm": 0.1435546875,
"learning_rate": 6.937394247038918e-05,
"loss": 2.2121,
"step": 205
},
{
"epoch": 0.07,
"grad_norm": 0.1376953125,
"learning_rate": 7.106598984771574e-05,
"loss": 2.2062,
"step": 210
},
{
"epoch": 0.07,
"grad_norm": 0.138671875,
"learning_rate": 7.275803722504231e-05,
"loss": 2.2315,
"step": 215
},
{
"epoch": 0.07,
"grad_norm": 0.142578125,
"learning_rate": 7.445008460236887e-05,
"loss": 2.209,
"step": 220
},
{
"epoch": 0.08,
"grad_norm": 0.1435546875,
"learning_rate": 7.614213197969543e-05,
"loss": 2.2177,
"step": 225
},
{
"epoch": 0.08,
"grad_norm": 0.1455078125,
"learning_rate": 7.783417935702199e-05,
"loss": 2.2296,
"step": 230
},
{
"epoch": 0.08,
"grad_norm": 0.15234375,
"learning_rate": 7.952622673434857e-05,
"loss": 2.2274,
"step": 235
},
{
"epoch": 0.08,
"grad_norm": 0.150390625,
"learning_rate": 8.121827411167512e-05,
"loss": 2.185,
"step": 240
},
{
"epoch": 0.08,
"grad_norm": 0.15234375,
"learning_rate": 8.29103214890017e-05,
"loss": 2.179,
"step": 245
},
{
"epoch": 0.08,
"grad_norm": 0.1494140625,
"learning_rate": 8.460236886632826e-05,
"loss": 2.2004,
"step": 250
},
{
"epoch": 0.09,
"grad_norm": 0.1494140625,
"learning_rate": 8.629441624365483e-05,
"loss": 2.2298,
"step": 255
},
{
"epoch": 0.09,
"grad_norm": 0.1435546875,
"learning_rate": 8.79864636209814e-05,
"loss": 2.2295,
"step": 260
},
{
"epoch": 0.09,
"grad_norm": 0.1513671875,
"learning_rate": 8.967851099830795e-05,
"loss": 2.2362,
"step": 265
},
{
"epoch": 0.09,
"grad_norm": 0.1494140625,
"learning_rate": 9.137055837563452e-05,
"loss": 2.2204,
"step": 270
},
{
"epoch": 0.09,
"grad_norm": 0.1513671875,
"learning_rate": 9.306260575296108e-05,
"loss": 2.1898,
"step": 275
},
{
"epoch": 0.09,
"grad_norm": 0.1474609375,
"learning_rate": 9.475465313028765e-05,
"loss": 2.1788,
"step": 280
},
{
"epoch": 0.1,
"grad_norm": 0.15625,
"learning_rate": 9.644670050761421e-05,
"loss": 2.1784,
"step": 285
},
{
"epoch": 0.1,
"grad_norm": 0.150390625,
"learning_rate": 9.813874788494079e-05,
"loss": 2.1903,
"step": 290
},
{
"epoch": 0.1,
"grad_norm": 0.1474609375,
"learning_rate": 9.983079526226735e-05,
"loss": 2.1735,
"step": 295
},
{
"epoch": 0.1,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001015228426395939,
"loss": 2.1936,
"step": 300
},
{
"epoch": 0.1,
"grad_norm": 0.150390625,
"learning_rate": 0.00010321489001692048,
"loss": 2.1986,
"step": 305
},
{
"epoch": 0.1,
"grad_norm": 0.1591796875,
"learning_rate": 0.00010490693739424704,
"loss": 2.1946,
"step": 310
},
{
"epoch": 0.11,
"grad_norm": 0.16015625,
"learning_rate": 0.00010659898477157362,
"loss": 2.2217,
"step": 315
},
{
"epoch": 0.11,
"grad_norm": 0.1572265625,
"learning_rate": 0.00010829103214890017,
"loss": 2.2082,
"step": 320
},
{
"epoch": 0.11,
"grad_norm": 0.1572265625,
"learning_rate": 0.00010998307952622673,
"loss": 2.1782,
"step": 325
},
{
"epoch": 0.11,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001116751269035533,
"loss": 2.1853,
"step": 330
},
{
"epoch": 0.11,
"grad_norm": 0.158203125,
"learning_rate": 0.00011336717428087986,
"loss": 2.1776,
"step": 335
},
{
"epoch": 0.12,
"grad_norm": 0.158203125,
"learning_rate": 0.00011505922165820644,
"loss": 2.1847,
"step": 340
},
{
"epoch": 0.12,
"grad_norm": 0.162109375,
"learning_rate": 0.000116751269035533,
"loss": 2.2162,
"step": 345
},
{
"epoch": 0.12,
"grad_norm": 0.15625,
"learning_rate": 0.00011844331641285957,
"loss": 2.1784,
"step": 350
},
{
"epoch": 0.12,
"grad_norm": 0.1640625,
"learning_rate": 0.00012013536379018613,
"loss": 2.1957,
"step": 355
},
{
"epoch": 0.12,
"grad_norm": 0.1552734375,
"learning_rate": 0.0001218274111675127,
"loss": 2.1975,
"step": 360
},
{
"epoch": 0.12,
"grad_norm": 0.1611328125,
"learning_rate": 0.00012351945854483927,
"loss": 2.2133,
"step": 365
},
{
"epoch": 0.13,
"grad_norm": 0.1650390625,
"learning_rate": 0.0001252115059221658,
"loss": 2.1745,
"step": 370
},
{
"epoch": 0.13,
"grad_norm": 0.1494140625,
"learning_rate": 0.0001269035532994924,
"loss": 2.1744,
"step": 375
},
{
"epoch": 0.13,
"grad_norm": 0.158203125,
"learning_rate": 0.00012859560067681895,
"loss": 2.19,
"step": 380
},
{
"epoch": 0.13,
"grad_norm": 0.158203125,
"learning_rate": 0.00013028764805414554,
"loss": 2.2186,
"step": 385
},
{
"epoch": 0.13,
"grad_norm": 0.162109375,
"learning_rate": 0.00013197969543147207,
"loss": 2.1992,
"step": 390
},
{
"epoch": 0.13,
"grad_norm": 0.16015625,
"learning_rate": 0.00013367174280879866,
"loss": 2.1905,
"step": 395
},
{
"epoch": 0.14,
"grad_norm": 0.158203125,
"learning_rate": 0.00013536379018612522,
"loss": 2.1836,
"step": 400
},
{
"epoch": 0.14,
"grad_norm": 0.15625,
"learning_rate": 0.00013705583756345178,
"loss": 2.2043,
"step": 405
},
{
"epoch": 0.14,
"grad_norm": 0.158203125,
"learning_rate": 0.00013874788494077836,
"loss": 2.2061,
"step": 410
},
{
"epoch": 0.14,
"grad_norm": 0.150390625,
"learning_rate": 0.0001404399323181049,
"loss": 2.1865,
"step": 415
},
{
"epoch": 0.14,
"grad_norm": 0.1552734375,
"learning_rate": 0.00014213197969543148,
"loss": 2.1918,
"step": 420
},
{
"epoch": 0.14,
"grad_norm": 0.1513671875,
"learning_rate": 0.00014382402707275804,
"loss": 2.188,
"step": 425
},
{
"epoch": 0.15,
"grad_norm": 0.15234375,
"learning_rate": 0.00014551607445008463,
"loss": 2.1819,
"step": 430
},
{
"epoch": 0.15,
"grad_norm": 0.1611328125,
"learning_rate": 0.00014720812182741116,
"loss": 2.1639,
"step": 435
},
{
"epoch": 0.15,
"grad_norm": 0.1513671875,
"learning_rate": 0.00014890016920473775,
"loss": 2.1927,
"step": 440
},
{
"epoch": 0.15,
"grad_norm": 0.15625,
"learning_rate": 0.0001505922165820643,
"loss": 2.1952,
"step": 445
},
{
"epoch": 0.15,
"grad_norm": 0.1669921875,
"learning_rate": 0.00015228426395939087,
"loss": 2.1961,
"step": 450
},
{
"epoch": 0.15,
"grad_norm": 0.1484375,
"learning_rate": 0.00015397631133671742,
"loss": 2.1983,
"step": 455
},
{
"epoch": 0.16,
"grad_norm": 0.1474609375,
"learning_rate": 0.00015566835871404398,
"loss": 2.1705,
"step": 460
},
{
"epoch": 0.16,
"grad_norm": 0.15234375,
"learning_rate": 0.00015736040609137057,
"loss": 2.1999,
"step": 465
},
{
"epoch": 0.16,
"grad_norm": 0.1533203125,
"learning_rate": 0.00015905245346869713,
"loss": 2.1524,
"step": 470
},
{
"epoch": 0.16,
"grad_norm": 0.146484375,
"learning_rate": 0.00016074450084602372,
"loss": 2.1903,
"step": 475
},
{
"epoch": 0.16,
"grad_norm": 0.1494140625,
"learning_rate": 0.00016243654822335025,
"loss": 2.174,
"step": 480
},
{
"epoch": 0.16,
"grad_norm": 0.1552734375,
"learning_rate": 0.00016412859560067684,
"loss": 2.2074,
"step": 485
},
{
"epoch": 0.17,
"grad_norm": 0.150390625,
"learning_rate": 0.0001658206429780034,
"loss": 2.1577,
"step": 490
},
{
"epoch": 0.17,
"grad_norm": 0.1552734375,
"learning_rate": 0.00016751269035532995,
"loss": 2.1827,
"step": 495
},
{
"epoch": 0.17,
"grad_norm": 0.15234375,
"learning_rate": 0.00016920473773265651,
"loss": 2.1913,
"step": 500
},
{
"epoch": 0.17,
"grad_norm": 0.16015625,
"learning_rate": 0.00017089678510998307,
"loss": 2.1873,
"step": 505
},
{
"epoch": 0.17,
"grad_norm": 0.154296875,
"learning_rate": 0.00017258883248730966,
"loss": 2.1936,
"step": 510
},
{
"epoch": 0.17,
"grad_norm": 0.15625,
"learning_rate": 0.00017428087986463622,
"loss": 2.1887,
"step": 515
},
{
"epoch": 0.18,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001759729272419628,
"loss": 2.2072,
"step": 520
},
{
"epoch": 0.18,
"grad_norm": 0.1533203125,
"learning_rate": 0.00017766497461928934,
"loss": 2.2,
"step": 525
},
{
"epoch": 0.18,
"grad_norm": 0.146484375,
"learning_rate": 0.0001793570219966159,
"loss": 2.1929,
"step": 530
},
{
"epoch": 0.18,
"grad_norm": 0.154296875,
"learning_rate": 0.00018104906937394248,
"loss": 2.1655,
"step": 535
},
{
"epoch": 0.18,
"grad_norm": 0.1513671875,
"learning_rate": 0.00018274111675126904,
"loss": 2.191,
"step": 540
},
{
"epoch": 0.18,
"grad_norm": 0.154296875,
"learning_rate": 0.0001844331641285956,
"loss": 2.2017,
"step": 545
},
{
"epoch": 0.19,
"grad_norm": 0.1533203125,
"learning_rate": 0.00018612521150592216,
"loss": 2.1921,
"step": 550
},
{
"epoch": 0.19,
"grad_norm": 0.150390625,
"learning_rate": 0.00018781725888324875,
"loss": 2.1688,
"step": 555
},
{
"epoch": 0.19,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001895093062605753,
"loss": 2.1694,
"step": 560
},
{
"epoch": 0.19,
"grad_norm": 0.1494140625,
"learning_rate": 0.00019120135363790187,
"loss": 2.1757,
"step": 565
},
{
"epoch": 0.19,
"grad_norm": 0.1552734375,
"learning_rate": 0.00019289340101522843,
"loss": 2.2084,
"step": 570
},
{
"epoch": 0.19,
"grad_norm": 0.1533203125,
"learning_rate": 0.000194585448392555,
"loss": 2.1611,
"step": 575
},
{
"epoch": 0.2,
"grad_norm": 0.1484375,
"learning_rate": 0.00019627749576988157,
"loss": 2.1807,
"step": 580
},
{
"epoch": 0.2,
"grad_norm": 0.1513671875,
"learning_rate": 0.00019796954314720813,
"loss": 2.1557,
"step": 585
},
{
"epoch": 0.2,
"grad_norm": 0.14453125,
"learning_rate": 0.0001996615905245347,
"loss": 2.1605,
"step": 590
},
{
"epoch": 0.2,
"grad_norm": 0.15234375,
"learning_rate": 0.00019999972028877317,
"loss": 2.1787,
"step": 595
},
{
"epoch": 0.2,
"grad_norm": 0.1494140625,
"learning_rate": 0.00019999858396459598,
"loss": 2.1948,
"step": 600
},
{
"epoch": 0.2,
"grad_norm": 0.14453125,
"learning_rate": 0.00019999657355544167,
"loss": 2.1525,
"step": 605
},
{
"epoch": 0.21,
"grad_norm": 0.142578125,
"learning_rate": 0.00019999368907888313,
"loss": 2.1583,
"step": 610
},
{
"epoch": 0.21,
"grad_norm": 0.1455078125,
"learning_rate": 0.0001999899305601336,
"loss": 2.2174,
"step": 615
},
{
"epoch": 0.21,
"grad_norm": 0.1484375,
"learning_rate": 0.0001999852980320461,
"loss": 2.1962,
"step": 620
},
{
"epoch": 0.21,
"grad_norm": 0.142578125,
"learning_rate": 0.0001999797915351135,
"loss": 2.167,
"step": 625
},
{
"epoch": 0.21,
"grad_norm": 0.142578125,
"learning_rate": 0.00019997341111746791,
"loss": 2.1438,
"step": 630
},
{
"epoch": 0.22,
"grad_norm": 0.146484375,
"learning_rate": 0.00019996615683488039,
"loss": 2.1757,
"step": 635
},
{
"epoch": 0.22,
"grad_norm": 0.1484375,
"learning_rate": 0.00019995802875076042,
"loss": 2.169,
"step": 640
},
{
"epoch": 0.22,
"grad_norm": 0.14453125,
"learning_rate": 0.0001999490269361554,
"loss": 2.1709,
"step": 645
},
{
"epoch": 0.22,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019993915146974996,
"loss": 2.1642,
"step": 650
},
{
"epoch": 0.22,
"grad_norm": 0.142578125,
"learning_rate": 0.00019992840243786525,
"loss": 2.17,
"step": 655
},
{
"epoch": 0.22,
"grad_norm": 0.1494140625,
"learning_rate": 0.0001999167799344583,
"loss": 2.1677,
"step": 660
},
{
"epoch": 0.23,
"grad_norm": 0.14453125,
"learning_rate": 0.0001999042840611211,
"loss": 2.195,
"step": 665
},
{
"epoch": 0.23,
"grad_norm": 0.142578125,
"learning_rate": 0.00019989091492707975,
"loss": 2.1986,
"step": 670
},
{
"epoch": 0.23,
"grad_norm": 0.150390625,
"learning_rate": 0.0001998766726491935,
"loss": 2.1513,
"step": 675
},
{
"epoch": 0.23,
"grad_norm": 0.142578125,
"learning_rate": 0.00019986155735195372,
"loss": 2.1754,
"step": 680
},
{
"epoch": 0.23,
"grad_norm": 0.1484375,
"learning_rate": 0.0001998455691674828,
"loss": 2.1832,
"step": 685
},
{
"epoch": 0.23,
"grad_norm": 0.138671875,
"learning_rate": 0.00019982870823553308,
"loss": 2.1831,
"step": 690
},
{
"epoch": 0.24,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019981097470348548,
"loss": 2.1757,
"step": 695
},
{
"epoch": 0.24,
"grad_norm": 0.1416015625,
"learning_rate": 0.00019979236872634838,
"loss": 2.1822,
"step": 700
},
{
"epoch": 0.24,
"grad_norm": 0.14453125,
"learning_rate": 0.0001997728904667561,
"loss": 2.1929,
"step": 705
},
{
"epoch": 0.24,
"grad_norm": 0.140625,
"learning_rate": 0.00019975254009496762,
"loss": 2.156,
"step": 710
},
{
"epoch": 0.24,
"grad_norm": 0.140625,
"learning_rate": 0.00019973131778886497,
"loss": 2.1597,
"step": 715
},
{
"epoch": 0.24,
"grad_norm": 0.1416015625,
"learning_rate": 0.00019970922373395176,
"loss": 2.1798,
"step": 720
},
{
"epoch": 0.25,
"grad_norm": 0.1435546875,
"learning_rate": 0.00019968625812335158,
"loss": 2.1946,
"step": 725
},
{
"epoch": 0.25,
"grad_norm": 0.1435546875,
"learning_rate": 0.00019966242115780617,
"loss": 2.1836,
"step": 730
},
{
"epoch": 0.25,
"grad_norm": 0.1484375,
"learning_rate": 0.00019963771304567387,
"loss": 2.1733,
"step": 735
},
{
"epoch": 0.25,
"grad_norm": 0.146484375,
"learning_rate": 0.00019961213400292762,
"loss": 2.161,
"step": 740
},
{
"epoch": 0.25,
"grad_norm": 0.1513671875,
"learning_rate": 0.00019958568425315314,
"loss": 2.188,
"step": 745
},
{
"epoch": 0.25,
"grad_norm": 0.146484375,
"learning_rate": 0.000199558364027547,
"loss": 2.1381,
"step": 750
},
{
"epoch": 0.26,
"grad_norm": 0.1533203125,
"learning_rate": 0.00019953017356491457,
"loss": 2.1973,
"step": 755
},
{
"epoch": 0.26,
"grad_norm": 0.150390625,
"learning_rate": 0.0001995011131116679,
"loss": 2.1808,
"step": 760
},
{
"epoch": 0.26,
"grad_norm": 0.1513671875,
"learning_rate": 0.00019947118292182377,
"loss": 2.1649,
"step": 765
},
{
"epoch": 0.26,
"grad_norm": 0.146484375,
"learning_rate": 0.00019944038325700103,
"loss": 2.171,
"step": 770
},
{
"epoch": 0.26,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019940871438641882,
"loss": 2.171,
"step": 775
},
{
"epoch": 0.26,
"grad_norm": 0.1416015625,
"learning_rate": 0.00019937617658689384,
"loss": 2.1943,
"step": 780
},
{
"epoch": 0.27,
"grad_norm": 0.138671875,
"learning_rate": 0.0001993427701428382,
"loss": 2.1225,
"step": 785
},
{
"epoch": 0.27,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001993084953462567,
"loss": 2.1786,
"step": 790
},
{
"epoch": 0.27,
"grad_norm": 0.142578125,
"learning_rate": 0.00019927335249674447,
"loss": 2.1775,
"step": 795
},
{
"epoch": 0.27,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019923734190148418,
"loss": 2.1732,
"step": 800
},
{
"epoch": 0.27,
"grad_norm": 0.14453125,
"learning_rate": 0.0001992004638752435,
"loss": 2.1627,
"step": 805
},
{
"epoch": 0.27,
"grad_norm": 0.1455078125,
"learning_rate": 0.0001991627187403723,
"loss": 2.1665,
"step": 810
},
{
"epoch": 0.28,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001991241068267998,
"loss": 2.1873,
"step": 815
},
{
"epoch": 0.28,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019908462847203172,
"loss": 2.1639,
"step": 820
},
{
"epoch": 0.28,
"grad_norm": 0.1416015625,
"learning_rate": 0.0001990442840211473,
"loss": 2.1677,
"step": 825
},
{
"epoch": 0.28,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019900307382679638,
"loss": 2.1378,
"step": 830
},
{
"epoch": 0.28,
"grad_norm": 0.1376953125,
"learning_rate": 0.00019896099824919604,
"loss": 2.1726,
"step": 835
},
{
"epoch": 0.28,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019891805765612794,
"loss": 2.1589,
"step": 840
},
{
"epoch": 0.29,
"grad_norm": 0.14453125,
"learning_rate": 0.0001988742524229346,
"loss": 2.1581,
"step": 845
},
{
"epoch": 0.29,
"grad_norm": 0.142578125,
"learning_rate": 0.00019882958293251636,
"loss": 2.1802,
"step": 850
},
{
"epoch": 0.29,
"grad_norm": 0.138671875,
"learning_rate": 0.00019878404957532814,
"loss": 2.1449,
"step": 855
},
{
"epoch": 0.29,
"grad_norm": 0.142578125,
"learning_rate": 0.00019873765274937578,
"loss": 2.1585,
"step": 860
},
{
"epoch": 0.29,
"grad_norm": 0.14453125,
"learning_rate": 0.00019869039286021271,
"loss": 2.1749,
"step": 865
},
{
"epoch": 0.29,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019864227032093633,
"loss": 2.1607,
"step": 870
},
{
"epoch": 0.3,
"grad_norm": 0.1435546875,
"learning_rate": 0.00019859328555218455,
"loss": 2.1621,
"step": 875
},
{
"epoch": 0.3,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001985434389821319,
"loss": 2.1609,
"step": 880
},
{
"epoch": 0.3,
"grad_norm": 0.142578125,
"learning_rate": 0.00019849273104648592,
"loss": 2.1601,
"step": 885
},
{
"epoch": 0.3,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019844116218848334,
"loss": 2.1498,
"step": 890
},
{
"epoch": 0.3,
"grad_norm": 0.142578125,
"learning_rate": 0.0001983887328588862,
"loss": 2.1896,
"step": 895
},
{
"epoch": 0.3,
"grad_norm": 0.1416015625,
"learning_rate": 0.00019833544351597788,
"loss": 2.1391,
"step": 900
},
{
"epoch": 0.31,
"grad_norm": 0.1484375,
"learning_rate": 0.0001982812946255591,
"loss": 2.149,
"step": 905
},
{
"epoch": 0.31,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001982262866609439,
"loss": 2.1336,
"step": 910
},
{
"epoch": 0.31,
"grad_norm": 0.14453125,
"learning_rate": 0.00019817042010295544,
"loss": 2.1388,
"step": 915
},
{
"epoch": 0.31,
"grad_norm": 0.1435546875,
"learning_rate": 0.00019811369543992183,
"loss": 2.1662,
"step": 920
},
{
"epoch": 0.31,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001980561131676718,
"loss": 2.1467,
"step": 925
},
{
"epoch": 0.31,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001979976737895305,
"loss": 2.1824,
"step": 930
},
{
"epoch": 0.32,
"grad_norm": 0.14453125,
"learning_rate": 0.00019793837781631506,
"loss": 2.1764,
"step": 935
},
{
"epoch": 0.32,
"grad_norm": 0.142578125,
"learning_rate": 0.0001978782257663299,
"loss": 2.1521,
"step": 940
},
{
"epoch": 0.32,
"grad_norm": 0.146484375,
"learning_rate": 0.00019781721816536257,
"loss": 2.1694,
"step": 945
},
{
"epoch": 0.32,
"grad_norm": 0.14453125,
"learning_rate": 0.00019775535554667886,
"loss": 2.178,
"step": 950
},
{
"epoch": 0.32,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019769263845101828,
"loss": 2.1895,
"step": 955
},
{
"epoch": 0.33,
"grad_norm": 0.1396484375,
"learning_rate": 0.00019762906742658935,
"loss": 2.1464,
"step": 960
},
{
"epoch": 0.33,
"grad_norm": 0.1484375,
"learning_rate": 0.00019756464302906465,
"loss": 2.1602,
"step": 965
},
{
"epoch": 0.33,
"grad_norm": 0.1494140625,
"learning_rate": 0.0001974993658215762,
"loss": 2.161,
"step": 970
},
{
"epoch": 0.33,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019743323637471028,
"loss": 2.1644,
"step": 975
},
{
"epoch": 0.33,
"grad_norm": 0.140625,
"learning_rate": 0.00019736625526650269,
"loss": 2.1672,
"step": 980
},
{
"epoch": 0.33,
"grad_norm": 0.1455078125,
"learning_rate": 0.0001972984230824335,
"loss": 2.2011,
"step": 985
},
{
"epoch": 0.34,
"grad_norm": 0.1435546875,
"learning_rate": 0.00019722974041542203,
"loss": 2.1562,
"step": 990
},
{
"epoch": 0.34,
"grad_norm": 0.138671875,
"learning_rate": 0.00019716020786582169,
"loss": 2.1351,
"step": 995
},
{
"epoch": 0.34,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001970898260414146,
"loss": 2.1662,
"step": 1000
},
{
"epoch": 0.34,
"grad_norm": 0.1572265625,
"learning_rate": 0.00019701859555740648,
"loss": 2.1601,
"step": 1005
},
{
"epoch": 0.34,
"grad_norm": 0.1513671875,
"learning_rate": 0.00019694651703642104,
"loss": 2.1325,
"step": 1010
},
{
"epoch": 0.34,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001968735911084948,
"loss": 2.1697,
"step": 1015
},
{
"epoch": 0.35,
"grad_norm": 0.142578125,
"learning_rate": 0.0001967998184110713,
"loss": 2.138,
"step": 1020
},
{
"epoch": 0.35,
"grad_norm": 0.1484375,
"learning_rate": 0.00019672519958899583,
"loss": 2.1444,
"step": 1025
},
{
"epoch": 0.35,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019664973529450946,
"loss": 2.1684,
"step": 1030
},
{
"epoch": 0.35,
"grad_norm": 0.146484375,
"learning_rate": 0.00019657342618724358,
"loss": 2.1495,
"step": 1035
},
{
"epoch": 0.35,
"grad_norm": 0.146484375,
"learning_rate": 0.00019649627293421413,
"loss": 2.1181,
"step": 1040
},
{
"epoch": 0.35,
"grad_norm": 0.142578125,
"learning_rate": 0.00019641827620981564,
"loss": 2.1262,
"step": 1045
},
{
"epoch": 0.36,
"grad_norm": 0.15234375,
"learning_rate": 0.0001963394366958154,
"loss": 2.1477,
"step": 1050
},
{
"epoch": 0.36,
"grad_norm": 0.1416015625,
"learning_rate": 0.00019625975508134755,
"loss": 2.1598,
"step": 1055
},
{
"epoch": 0.36,
"grad_norm": 0.142578125,
"learning_rate": 0.00019617923206290692,
"loss": 2.165,
"step": 1060
},
{
"epoch": 0.36,
"grad_norm": 0.150390625,
"learning_rate": 0.00019609786834434313,
"loss": 2.1672,
"step": 1065
},
{
"epoch": 0.36,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019601566463685425,
"loss": 2.1639,
"step": 1070
},
{
"epoch": 0.36,
"grad_norm": 0.14453125,
"learning_rate": 0.00019593262165898076,
"loss": 2.1579,
"step": 1075
},
{
"epoch": 0.37,
"grad_norm": 0.1484375,
"learning_rate": 0.0001958487401365991,
"loss": 2.1628,
"step": 1080
},
{
"epoch": 0.37,
"grad_norm": 0.1416015625,
"learning_rate": 0.00019576402080291545,
"loss": 2.1964,
"step": 1085
},
{
"epoch": 0.37,
"grad_norm": 0.1435546875,
"learning_rate": 0.00019567846439845927,
"loss": 2.1792,
"step": 1090
},
{
"epoch": 0.37,
"grad_norm": 0.146484375,
"learning_rate": 0.00019559207167107684,
"loss": 2.1703,
"step": 1095
},
{
"epoch": 0.37,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019550484337592464,
"loss": 2.1512,
"step": 1100
},
{
"epoch": 0.37,
"grad_norm": 0.142578125,
"learning_rate": 0.00019541678027546296,
"loss": 2.1653,
"step": 1105
},
{
"epoch": 0.38,
"grad_norm": 0.140625,
"learning_rate": 0.00019532788313944904,
"loss": 2.1729,
"step": 1110
},
{
"epoch": 0.38,
"grad_norm": 0.1494140625,
"learning_rate": 0.00019523815274493031,
"loss": 2.1702,
"step": 1115
},
{
"epoch": 0.38,
"grad_norm": 0.1533203125,
"learning_rate": 0.00019514758987623784,
"loss": 2.1723,
"step": 1120
},
{
"epoch": 0.38,
"grad_norm": 0.1416015625,
"learning_rate": 0.00019505619532497926,
"loss": 2.174,
"step": 1125
},
{
"epoch": 0.38,
"grad_norm": 0.1435546875,
"learning_rate": 0.00019496396989003193,
"loss": 2.1431,
"step": 1130
},
{
"epoch": 0.38,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019487091437753594,
"loss": 2.1581,
"step": 1135
},
{
"epoch": 0.39,
"grad_norm": 0.14453125,
"learning_rate": 0.00019477702960088702,
"loss": 2.1764,
"step": 1140
},
{
"epoch": 0.39,
"grad_norm": 0.146484375,
"learning_rate": 0.0001946823163807296,
"loss": 2.1656,
"step": 1145
},
{
"epoch": 0.39,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019458677554494932,
"loss": 2.1645,
"step": 1150
},
{
"epoch": 0.39,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001944904079286662,
"loss": 2.1502,
"step": 1155
},
{
"epoch": 0.39,
"grad_norm": 0.14453125,
"learning_rate": 0.00019439321437422695,
"loss": 2.1563,
"step": 1160
},
{
"epoch": 0.39,
"grad_norm": 0.146484375,
"learning_rate": 0.00019429519573119794,
"loss": 2.1559,
"step": 1165
},
{
"epoch": 0.4,
"grad_norm": 0.1494140625,
"learning_rate": 0.00019419635285635746,
"loss": 2.1708,
"step": 1170
},
{
"epoch": 0.4,
"grad_norm": 0.1455078125,
"learning_rate": 0.0001940966866136885,
"loss": 2.1393,
"step": 1175
},
{
"epoch": 0.4,
"grad_norm": 0.1494140625,
"learning_rate": 0.00019399619787437104,
"loss": 2.1421,
"step": 1180
},
{
"epoch": 0.4,
"grad_norm": 0.146484375,
"learning_rate": 0.0001938948875167745,
"loss": 2.1468,
"step": 1185
},
{
"epoch": 0.4,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019379275642645002,
"loss": 2.1617,
"step": 1190
},
{
"epoch": 0.4,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001936898054961228,
"loss": 2.1828,
"step": 1195
},
{
"epoch": 0.41,
"grad_norm": 0.1484375,
"learning_rate": 0.00019358603562568416,
"loss": 2.1236,
"step": 1200
},
{
"epoch": 0.41,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001934814477221838,
"loss": 2.1469,
"step": 1205
},
{
"epoch": 0.41,
"grad_norm": 0.1416015625,
"learning_rate": 0.0001933760426998218,
"loss": 2.1827,
"step": 1210
},
{
"epoch": 0.41,
"grad_norm": 0.14453125,
"learning_rate": 0.0001932698214799407,
"loss": 2.1483,
"step": 1215
},
{
"epoch": 0.41,
"grad_norm": 0.14453125,
"learning_rate": 0.0001931627849910174,
"loss": 2.1429,
"step": 1220
},
{
"epoch": 0.41,
"grad_norm": 0.1435546875,
"learning_rate": 0.00019305493416865493,
"loss": 2.128,
"step": 1225
},
{
"epoch": 0.42,
"grad_norm": 0.14453125,
"learning_rate": 0.00019294626995557457,
"loss": 2.1609,
"step": 1230
},
{
"epoch": 0.42,
"grad_norm": 0.1494140625,
"learning_rate": 0.00019283679330160726,
"loss": 2.1455,
"step": 1235
},
{
"epoch": 0.42,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001927265051636856,
"loss": 2.1553,
"step": 1240
},
{
"epoch": 0.42,
"grad_norm": 0.14453125,
"learning_rate": 0.00019261540650583522,
"loss": 2.1856,
"step": 1245
},
{
"epoch": 0.42,
"grad_norm": 0.142578125,
"learning_rate": 0.00019250349829916661,
"loss": 2.1403,
"step": 1250
},
{
"epoch": 0.43,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001923907815218664,
"loss": 2.1798,
"step": 1255
},
{
"epoch": 0.43,
"grad_norm": 0.1416015625,
"learning_rate": 0.00019227725715918897,
"loss": 2.137,
"step": 1260
},
{
"epoch": 0.43,
"grad_norm": 0.14453125,
"learning_rate": 0.00019216292620344777,
"loss": 2.1738,
"step": 1265
},
{
"epoch": 0.43,
"grad_norm": 0.1533203125,
"learning_rate": 0.00019204778965400667,
"loss": 2.1471,
"step": 1270
},
{
"epoch": 0.43,
"grad_norm": 0.1455078125,
"learning_rate": 0.0001919318485172712,
"loss": 2.1756,
"step": 1275
},
{
"epoch": 0.43,
"grad_norm": 0.15234375,
"learning_rate": 0.00019181510380667977,
"loss": 2.165,
"step": 1280
},
{
"epoch": 0.44,
"grad_norm": 0.14453125,
"learning_rate": 0.0001916975565426948,
"loss": 2.17,
"step": 1285
},
{
"epoch": 0.44,
"grad_norm": 0.1455078125,
"learning_rate": 0.00019157920775279383,
"loss": 2.1469,
"step": 1290
},
{
"epoch": 0.44,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001914600584714605,
"loss": 2.1457,
"step": 1295
},
{
"epoch": 0.44,
"grad_norm": 0.154296875,
"learning_rate": 0.0001913401097401755,
"loss": 2.1362,
"step": 1300
},
{
"epoch": 0.44,
"grad_norm": 0.1474609375,
"learning_rate": 0.00019121936260740752,
"loss": 2.1762,
"step": 1305
},
{
"epoch": 0.44,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001910978181286041,
"loss": 2.161,
"step": 1310
},
{
"epoch": 0.45,
"grad_norm": 0.146484375,
"learning_rate": 0.00019097547736618228,
"loss": 2.1565,
"step": 1315
},
{
"epoch": 0.45,
"grad_norm": 0.146484375,
"learning_rate": 0.0001908523413895194,
"loss": 2.139,
"step": 1320
},
{
"epoch": 0.45,
"grad_norm": 0.14453125,
"learning_rate": 0.0001907284112749438,
"loss": 2.1392,
"step": 1325
},
{
"epoch": 0.45,
"grad_norm": 0.146484375,
"learning_rate": 0.00019060368810572539,
"loss": 2.1563,
"step": 1330
},
{
"epoch": 0.45,
"grad_norm": 0.150390625,
"learning_rate": 0.00019047817297206598,
"loss": 2.1203,
"step": 1335
},
{
"epoch": 0.45,
"grad_norm": 0.1513671875,
"learning_rate": 0.00019035186697109011,
"loss": 2.1534,
"step": 1340
},
{
"epoch": 0.46,
"grad_norm": 0.14453125,
"learning_rate": 0.0001902247712068352,
"loss": 2.1504,
"step": 1345
},
{
"epoch": 0.46,
"grad_norm": 0.1435546875,
"learning_rate": 0.0001900968867902419,
"loss": 2.1798,
"step": 1350
},
{
"epoch": 0.46,
"grad_norm": 0.150390625,
"learning_rate": 0.0001899682148391446,
"loss": 2.1541,
"step": 1355
},
{
"epoch": 0.46,
"grad_norm": 0.1513671875,
"learning_rate": 0.00018983875647826136,
"loss": 2.1286,
"step": 1360
},
{
"epoch": 0.46,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018970851283918428,
"loss": 2.1471,
"step": 1365
},
{
"epoch": 0.46,
"grad_norm": 0.1435546875,
"learning_rate": 0.00018957748506036957,
"loss": 2.137,
"step": 1370
},
{
"epoch": 0.47,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018944567428712765,
"loss": 2.1514,
"step": 1375
},
{
"epoch": 0.47,
"grad_norm": 0.1484375,
"learning_rate": 0.0001893130816716129,
"loss": 2.1668,
"step": 1380
},
{
"epoch": 0.47,
"grad_norm": 0.1455078125,
"learning_rate": 0.00018917970837281392,
"loss": 2.1723,
"step": 1385
},
{
"epoch": 0.47,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018904555555654317,
"loss": 2.1846,
"step": 1390
},
{
"epoch": 0.47,
"grad_norm": 0.150390625,
"learning_rate": 0.0001889106243954269,
"loss": 2.1571,
"step": 1395
},
{
"epoch": 0.47,
"grad_norm": 0.1484375,
"learning_rate": 0.00018877491606889476,
"loss": 2.1311,
"step": 1400
},
{
"epoch": 0.48,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001886384317631697,
"loss": 2.1693,
"step": 1405
},
{
"epoch": 0.48,
"grad_norm": 0.146484375,
"learning_rate": 0.00018850117267125738,
"loss": 2.1773,
"step": 1410
},
{
"epoch": 0.48,
"grad_norm": 0.14453125,
"learning_rate": 0.00018836313999293593,
"loss": 2.1599,
"step": 1415
},
{
"epoch": 0.48,
"grad_norm": 0.1435546875,
"learning_rate": 0.00018822433493474532,
"loss": 2.1282,
"step": 1420
},
{
"epoch": 0.48,
"grad_norm": 0.1533203125,
"learning_rate": 0.0001880847587099769,
"loss": 2.1315,
"step": 1425
},
{
"epoch": 0.48,
"grad_norm": 0.146484375,
"learning_rate": 0.00018794441253866274,
"loss": 2.1701,
"step": 1430
},
{
"epoch": 0.49,
"grad_norm": 0.150390625,
"learning_rate": 0.00018780329764756505,
"loss": 2.15,
"step": 1435
},
{
"epoch": 0.49,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018766141527016533,
"loss": 2.1334,
"step": 1440
},
{
"epoch": 0.49,
"grad_norm": 0.14453125,
"learning_rate": 0.00018751876664665367,
"loss": 2.1351,
"step": 1445
},
{
"epoch": 0.49,
"grad_norm": 0.1513671875,
"learning_rate": 0.00018737535302391795,
"loss": 2.1777,
"step": 1450
},
{
"epoch": 0.49,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018723117565553284,
"loss": 2.1515,
"step": 1455
},
{
"epoch": 0.49,
"grad_norm": 0.150390625,
"learning_rate": 0.00018708623580174889,
"loss": 2.1432,
"step": 1460
},
{
"epoch": 0.5,
"grad_norm": 0.1484375,
"learning_rate": 0.00018694053472948156,
"loss": 2.1384,
"step": 1465
},
{
"epoch": 0.5,
"grad_norm": 0.1484375,
"learning_rate": 0.00018679407371230002,
"loss": 2.1572,
"step": 1470
},
{
"epoch": 0.5,
"grad_norm": 0.1484375,
"learning_rate": 0.00018664685403041619,
"loss": 2.1624,
"step": 1475
},
{
"epoch": 0.5,
"grad_norm": 0.142578125,
"learning_rate": 0.0001864988769706734,
"loss": 2.1621,
"step": 1480
},
{
"epoch": 0.5,
"grad_norm": 0.146484375,
"learning_rate": 0.0001863501438265352,
"loss": 2.1513,
"step": 1485
},
{
"epoch": 0.5,
"grad_norm": 0.1494140625,
"learning_rate": 0.00018620065589807413,
"loss": 2.1648,
"step": 1490
},
{
"epoch": 0.51,
"grad_norm": 0.1484375,
"learning_rate": 0.00018605041449196012,
"loss": 2.18,
"step": 1495
},
{
"epoch": 0.51,
"grad_norm": 0.14453125,
"learning_rate": 0.00018589942092144942,
"loss": 2.1806,
"step": 1500
},
{
"epoch": 0.51,
"grad_norm": 0.1552734375,
"learning_rate": 0.00018574767650637278,
"loss": 2.1141,
"step": 1505
},
{
"epoch": 0.51,
"grad_norm": 0.1484375,
"learning_rate": 0.0001855951825731241,
"loss": 2.1332,
"step": 1510
},
{
"epoch": 0.51,
"grad_norm": 0.1435546875,
"learning_rate": 0.00018544194045464886,
"loss": 2.1366,
"step": 1515
},
{
"epoch": 0.51,
"grad_norm": 0.146484375,
"learning_rate": 0.00018528795149043236,
"loss": 2.1581,
"step": 1520
},
{
"epoch": 0.52,
"grad_norm": 0.1494140625,
"learning_rate": 0.00018513321702648807,
"loss": 2.1433,
"step": 1525
},
{
"epoch": 0.52,
"grad_norm": 0.1484375,
"learning_rate": 0.0001849777384153458,
"loss": 2.1543,
"step": 1530
},
{
"epoch": 0.52,
"grad_norm": 0.1455078125,
"learning_rate": 0.00018482151701604003,
"loss": 2.1629,
"step": 1535
},
{
"epoch": 0.52,
"grad_norm": 0.1455078125,
"learning_rate": 0.00018466455419409786,
"loss": 2.1542,
"step": 1540
},
{
"epoch": 0.52,
"grad_norm": 0.1455078125,
"learning_rate": 0.0001845068513215271,
"loss": 2.143,
"step": 1545
},
{
"epoch": 0.52,
"grad_norm": 0.146484375,
"learning_rate": 0.00018434840977680453,
"loss": 2.1375,
"step": 1550
},
{
"epoch": 0.53,
"grad_norm": 0.146484375,
"learning_rate": 0.00018418923094486338,
"loss": 2.1539,
"step": 1555
},
{
"epoch": 0.53,
"grad_norm": 0.146484375,
"learning_rate": 0.00018402931621708165,
"loss": 2.1611,
"step": 1560
},
{
"epoch": 0.53,
"grad_norm": 0.1494140625,
"learning_rate": 0.00018386866699126973,
"loss": 2.1789,
"step": 1565
},
{
"epoch": 0.53,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018370728467165828,
"loss": 2.1419,
"step": 1570
},
{
"epoch": 0.53,
"grad_norm": 0.146484375,
"learning_rate": 0.0001835451706688859,
"loss": 2.1748,
"step": 1575
},
{
"epoch": 0.54,
"grad_norm": 0.1455078125,
"learning_rate": 0.0001833823263999867,
"loss": 2.1267,
"step": 1580
},
{
"epoch": 0.54,
"grad_norm": 0.150390625,
"learning_rate": 0.00018321875328837828,
"loss": 2.1319,
"step": 1585
},
{
"epoch": 0.54,
"grad_norm": 0.1494140625,
"learning_rate": 0.00018305445276384875,
"loss": 2.1438,
"step": 1590
},
{
"epoch": 0.54,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018288942626254473,
"loss": 2.1572,
"step": 1595
},
{
"epoch": 0.54,
"grad_norm": 0.150390625,
"learning_rate": 0.00018272367522695844,
"loss": 2.1463,
"step": 1600
},
{
"epoch": 0.54,
"grad_norm": 0.1455078125,
"learning_rate": 0.00018255720110591533,
"loss": 2.1568,
"step": 1605
},
{
"epoch": 0.55,
"grad_norm": 0.150390625,
"learning_rate": 0.0001823900053545613,
"loss": 2.1392,
"step": 1610
},
{
"epoch": 0.55,
"grad_norm": 0.154296875,
"learning_rate": 0.00018222208943434999,
"loss": 2.1496,
"step": 1615
},
{
"epoch": 0.55,
"grad_norm": 0.150390625,
"learning_rate": 0.00018205345481302998,
"loss": 2.1474,
"step": 1620
},
{
"epoch": 0.55,
"grad_norm": 0.1533203125,
"learning_rate": 0.0001818841029646321,
"loss": 2.153,
"step": 1625
},
{
"epoch": 0.55,
"grad_norm": 0.1513671875,
"learning_rate": 0.00018171403536945628,
"loss": 2.1697,
"step": 1630
},
{
"epoch": 0.55,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018154325351405897,
"loss": 2.185,
"step": 1635
},
{
"epoch": 0.56,
"grad_norm": 0.1513671875,
"learning_rate": 0.00018137175889123978,
"loss": 2.1443,
"step": 1640
},
{
"epoch": 0.56,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001811995530000287,
"loss": 2.1402,
"step": 1645
},
{
"epoch": 0.56,
"grad_norm": 0.1494140625,
"learning_rate": 0.00018102663734567283,
"loss": 2.1414,
"step": 1650
},
{
"epoch": 0.56,
"grad_norm": 0.1494140625,
"learning_rate": 0.0001808530134396234,
"loss": 2.1552,
"step": 1655
},
{
"epoch": 0.56,
"grad_norm": 0.1474609375,
"learning_rate": 0.00018067868279952236,
"loss": 2.1374,
"step": 1660
},
{
"epoch": 0.56,
"grad_norm": 0.1494140625,
"learning_rate": 0.0001805036469491892,
"loss": 2.1671,
"step": 1665
},
{
"epoch": 0.57,
"grad_norm": 0.1494140625,
"learning_rate": 0.00018032790741860763,
"loss": 2.1369,
"step": 1670
},
{
"epoch": 0.57,
"grad_norm": 0.150390625,
"learning_rate": 0.00018015146574391233,
"loss": 2.1283,
"step": 1675
},
{
"epoch": 0.57,
"grad_norm": 0.1484375,
"learning_rate": 0.00017997432346737524,
"loss": 2.1504,
"step": 1680
},
{
"epoch": 0.57,
"grad_norm": 0.1474609375,
"learning_rate": 0.00017979648213739232,
"loss": 2.1338,
"step": 1685
},
{
"epoch": 0.57,
"grad_norm": 0.1474609375,
"learning_rate": 0.00017961794330846994,
"loss": 2.1555,
"step": 1690
},
{
"epoch": 0.57,
"grad_norm": 0.1513671875,
"learning_rate": 0.00017943870854121124,
"loss": 2.1543,
"step": 1695
},
{
"epoch": 0.58,
"grad_norm": 0.15234375,
"learning_rate": 0.0001792587794023027,
"loss": 2.1428,
"step": 1700
},
{
"epoch": 0.58,
"grad_norm": 0.1494140625,
"learning_rate": 0.00017907815746450004,
"loss": 2.1174,
"step": 1705
},
{
"epoch": 0.58,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001788968443066149,
"loss": 2.1251,
"step": 1710
},
{
"epoch": 0.58,
"grad_norm": 0.1474609375,
"learning_rate": 0.00017871484151350077,
"loss": 2.1073,
"step": 1715
},
{
"epoch": 0.58,
"grad_norm": 0.1484375,
"learning_rate": 0.00017853215067603926,
"loss": 2.1918,
"step": 1720
},
{
"epoch": 0.58,
"grad_norm": 0.1474609375,
"learning_rate": 0.00017834877339112612,
"loss": 2.126,
"step": 1725
},
{
"epoch": 0.59,
"grad_norm": 0.150390625,
"learning_rate": 0.0001781647112616574,
"loss": 2.1565,
"step": 1730
},
{
"epoch": 0.59,
"grad_norm": 0.146484375,
"learning_rate": 0.0001779799658965153,
"loss": 2.137,
"step": 1735
},
{
"epoch": 0.59,
"grad_norm": 0.146484375,
"learning_rate": 0.00017779453891055412,
"loss": 2.1713,
"step": 1740
},
{
"epoch": 0.59,
"grad_norm": 0.154296875,
"learning_rate": 0.00017760843192458626,
"loss": 2.1457,
"step": 1745
},
{
"epoch": 0.59,
"grad_norm": 0.150390625,
"learning_rate": 0.00017742164656536798,
"loss": 2.1576,
"step": 1750
},
{
"epoch": 0.59,
"grad_norm": 0.154296875,
"learning_rate": 0.00017723418446558516,
"loss": 2.1461,
"step": 1755
},
{
"epoch": 0.6,
"grad_norm": 0.1474609375,
"learning_rate": 0.00017704604726383904,
"loss": 2.1383,
"step": 1760
},
{
"epoch": 0.6,
"grad_norm": 0.1484375,
"learning_rate": 0.00017685723660463193,
"loss": 2.1299,
"step": 1765
},
{
"epoch": 0.6,
"grad_norm": 0.1474609375,
"learning_rate": 0.00017666775413835282,
"loss": 2.1461,
"step": 1770
},
{
"epoch": 0.6,
"grad_norm": 0.150390625,
"learning_rate": 0.0001764776015212629,
"loss": 2.1075,
"step": 1775
},
{
"epoch": 0.6,
"grad_norm": 0.1484375,
"learning_rate": 0.0001762867804154812,
"loss": 2.1439,
"step": 1780
},
{
"epoch": 0.6,
"grad_norm": 0.1494140625,
"learning_rate": 0.00017609529248896997,
"loss": 2.1802,
"step": 1785
},
{
"epoch": 0.61,
"grad_norm": 0.1513671875,
"learning_rate": 0.00017590313941552002,
"loss": 2.1746,
"step": 1790
},
{
"epoch": 0.61,
"grad_norm": 0.1513671875,
"learning_rate": 0.00017571032287473642,
"loss": 2.1198,
"step": 1795
},
{
"epoch": 0.61,
"grad_norm": 0.15234375,
"learning_rate": 0.00017551684455202336,
"loss": 2.128,
"step": 1800
},
{
"epoch": 0.61,
"grad_norm": 0.1494140625,
"learning_rate": 0.00017532270613856976,
"loss": 2.153,
"step": 1805
},
{
"epoch": 0.61,
"grad_norm": 0.1494140625,
"learning_rate": 0.00017512790933133437,
"loss": 2.1273,
"step": 1810
},
{
"epoch": 0.61,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001749324558330309,
"loss": 2.1527,
"step": 1815
},
{
"epoch": 0.62,
"grad_norm": 0.146484375,
"learning_rate": 0.0001747363473521132,
"loss": 2.1482,
"step": 1820
},
{
"epoch": 0.62,
"grad_norm": 0.1533203125,
"learning_rate": 0.00017453958560276038,
"loss": 2.1475,
"step": 1825
},
{
"epoch": 0.62,
"grad_norm": 0.1494140625,
"learning_rate": 0.00017434217230486164,
"loss": 2.1606,
"step": 1830
},
{
"epoch": 0.62,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001741441091840014,
"loss": 2.1657,
"step": 1835
},
{
"epoch": 0.62,
"grad_norm": 0.1533203125,
"learning_rate": 0.00017394539797144413,
"loss": 2.1167,
"step": 1840
},
{
"epoch": 0.62,
"grad_norm": 0.150390625,
"learning_rate": 0.00017374604040411935,
"loss": 2.1317,
"step": 1845
},
{
"epoch": 0.63,
"grad_norm": 0.150390625,
"learning_rate": 0.00017354603822460621,
"loss": 2.1323,
"step": 1850
},
{
"epoch": 0.63,
"grad_norm": 0.154296875,
"learning_rate": 0.00017334539318111856,
"loss": 2.1418,
"step": 1855
},
{
"epoch": 0.63,
"grad_norm": 0.150390625,
"learning_rate": 0.00017314410702748932,
"loss": 2.1342,
"step": 1860
},
{
"epoch": 0.63,
"grad_norm": 0.1533203125,
"learning_rate": 0.00017294218152315546,
"loss": 2.1471,
"step": 1865
},
{
"epoch": 0.63,
"grad_norm": 0.1494140625,
"learning_rate": 0.00017273961843314252,
"loss": 2.1469,
"step": 1870
},
{
"epoch": 0.64,
"grad_norm": 0.1552734375,
"learning_rate": 0.0001725364195280491,
"loss": 2.1527,
"step": 1875
},
{
"epoch": 0.64,
"grad_norm": 0.1533203125,
"learning_rate": 0.00017233258658403138,
"loss": 2.1461,
"step": 1880
},
{
"epoch": 0.64,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001721281213827878,
"loss": 2.1401,
"step": 1885
},
{
"epoch": 0.64,
"grad_norm": 0.1494140625,
"learning_rate": 0.00017192302571154331,
"loss": 2.1612,
"step": 1890
},
{
"epoch": 0.64,
"grad_norm": 0.15234375,
"learning_rate": 0.00017171730136303364,
"loss": 2.1487,
"step": 1895
},
{
"epoch": 0.64,
"grad_norm": 0.1474609375,
"learning_rate": 0.00017151095013548994,
"loss": 2.1444,
"step": 1900
},
{
"epoch": 0.65,
"grad_norm": 0.1484375,
"learning_rate": 0.00017130397383262284,
"loss": 2.1251,
"step": 1905
},
{
"epoch": 0.65,
"grad_norm": 0.146484375,
"learning_rate": 0.0001710963742636067,
"loss": 2.1352,
"step": 1910
},
{
"epoch": 0.65,
"grad_norm": 0.150390625,
"learning_rate": 0.00017088815324306392,
"loss": 2.1285,
"step": 1915
},
{
"epoch": 0.65,
"grad_norm": 0.1552734375,
"learning_rate": 0.00017067931259104885,
"loss": 2.1599,
"step": 1920
},
{
"epoch": 0.65,
"grad_norm": 0.1494140625,
"learning_rate": 0.00017046985413303215,
"loss": 2.1218,
"step": 1925
},
{
"epoch": 0.65,
"grad_norm": 0.1494140625,
"learning_rate": 0.00017025977969988465,
"loss": 2.1374,
"step": 1930
},
{
"epoch": 0.66,
"grad_norm": 0.1513671875,
"learning_rate": 0.00017004909112786144,
"loss": 2.1299,
"step": 1935
},
{
"epoch": 0.66,
"grad_norm": 0.1533203125,
"learning_rate": 0.0001698377902585857,
"loss": 2.1378,
"step": 1940
},
{
"epoch": 0.66,
"grad_norm": 0.1494140625,
"learning_rate": 0.00016962587893903276,
"loss": 2.1501,
"step": 1945
},
{
"epoch": 0.66,
"grad_norm": 0.150390625,
"learning_rate": 0.0001694133590215139,
"loss": 2.1612,
"step": 1950
},
{
"epoch": 0.66,
"grad_norm": 0.1474609375,
"learning_rate": 0.00016920023236366002,
"loss": 2.1134,
"step": 1955
},
{
"epoch": 0.66,
"grad_norm": 0.146484375,
"learning_rate": 0.00016898650082840572,
"loss": 2.1339,
"step": 1960
},
{
"epoch": 0.67,
"grad_norm": 0.1474609375,
"learning_rate": 0.00016877216628397257,
"loss": 2.1258,
"step": 1965
},
{
"epoch": 0.67,
"grad_norm": 0.15234375,
"learning_rate": 0.0001685572306038532,
"loss": 2.1085,
"step": 1970
},
{
"epoch": 0.67,
"grad_norm": 0.1572265625,
"learning_rate": 0.0001683416956667947,
"loss": 2.1549,
"step": 1975
},
{
"epoch": 0.67,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001681255633567823,
"loss": 2.1573,
"step": 1980
},
{
"epoch": 0.67,
"grad_norm": 0.150390625,
"learning_rate": 0.00016790883556302272,
"loss": 2.1519,
"step": 1985
},
{
"epoch": 0.67,
"grad_norm": 0.1572265625,
"learning_rate": 0.00016769151417992791,
"loss": 2.1426,
"step": 1990
},
{
"epoch": 0.68,
"grad_norm": 0.1513671875,
"learning_rate": 0.00016747360110709838,
"loss": 2.1406,
"step": 1995
},
{
"epoch": 0.68,
"grad_norm": 0.1552734375,
"learning_rate": 0.00016725509824930645,
"loss": 2.1495,
"step": 2000
},
{
"epoch": 0.68,
"grad_norm": 0.154296875,
"learning_rate": 0.0001670360075164799,
"loss": 2.1589,
"step": 2005
},
{
"epoch": 0.68,
"grad_norm": 0.150390625,
"learning_rate": 0.00016681633082368498,
"loss": 2.1569,
"step": 2010
},
{
"epoch": 0.68,
"grad_norm": 0.150390625,
"learning_rate": 0.00016659607009110984,
"loss": 2.1429,
"step": 2015
},
{
"epoch": 0.68,
"grad_norm": 0.15234375,
"learning_rate": 0.00016637522724404774,
"loss": 2.1547,
"step": 2020
},
{
"epoch": 0.69,
"grad_norm": 0.150390625,
"learning_rate": 0.00016615380421288018,
"loss": 2.1187,
"step": 2025
},
{
"epoch": 0.69,
"grad_norm": 0.1474609375,
"learning_rate": 0.00016593180293306001,
"loss": 2.1405,
"step": 2030
},
{
"epoch": 0.69,
"grad_norm": 0.15234375,
"learning_rate": 0.0001657092253450945,
"loss": 2.1324,
"step": 2035
},
{
"epoch": 0.69,
"grad_norm": 0.1484375,
"learning_rate": 0.00016548607339452853,
"loss": 2.1106,
"step": 2040
},
{
"epoch": 0.69,
"grad_norm": 0.150390625,
"learning_rate": 0.00016526234903192733,
"loss": 2.1533,
"step": 2045
},
{
"epoch": 0.69,
"grad_norm": 0.1484375,
"learning_rate": 0.00016503805421285968,
"loss": 2.1514,
"step": 2050
},
{
"epoch": 0.7,
"grad_norm": 0.15234375,
"learning_rate": 0.00016481319089788063,
"loss": 2.1282,
"step": 2055
},
{
"epoch": 0.7,
"grad_norm": 0.1494140625,
"learning_rate": 0.00016458776105251447,
"loss": 2.1588,
"step": 2060
},
{
"epoch": 0.7,
"grad_norm": 0.1494140625,
"learning_rate": 0.0001643617666472376,
"loss": 2.1291,
"step": 2065
},
{
"epoch": 0.7,
"grad_norm": 0.1513671875,
"learning_rate": 0.00016413520965746097,
"loss": 2.1661,
"step": 2070
},
{
"epoch": 0.7,
"grad_norm": 0.1484375,
"learning_rate": 0.0001639080920635134,
"loss": 2.1415,
"step": 2075
},
{
"epoch": 0.7,
"grad_norm": 0.1591796875,
"learning_rate": 0.0001636804158506237,
"loss": 2.1533,
"step": 2080
},
{
"epoch": 0.71,
"grad_norm": 0.15234375,
"learning_rate": 0.00016345218300890357,
"loss": 2.1245,
"step": 2085
},
{
"epoch": 0.71,
"grad_norm": 0.1533203125,
"learning_rate": 0.00016322339553333034,
"loss": 2.1427,
"step": 2090
},
{
"epoch": 0.71,
"grad_norm": 0.1484375,
"learning_rate": 0.00016299405542372924,
"loss": 2.1526,
"step": 2095
},
{
"epoch": 0.71,
"grad_norm": 0.1572265625,
"learning_rate": 0.00016276416468475607,
"loss": 2.1265,
"step": 2100
},
{
"epoch": 0.71,
"grad_norm": 0.1484375,
"learning_rate": 0.00016253372532587976,
"loss": 2.1504,
"step": 2105
},
{
"epoch": 0.71,
"grad_norm": 0.1533203125,
"learning_rate": 0.0001623027393613646,
"loss": 2.1564,
"step": 2110
},
{
"epoch": 0.72,
"grad_norm": 0.1552734375,
"learning_rate": 0.00016207120881025282,
"loss": 2.1497,
"step": 2115
},
{
"epoch": 0.72,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001618391356963468,
"loss": 2.1096,
"step": 2120
},
{
"epoch": 0.72,
"grad_norm": 0.150390625,
"learning_rate": 0.00016160652204819158,
"loss": 2.144,
"step": 2125
},
{
"epoch": 0.72,
"grad_norm": 0.154296875,
"learning_rate": 0.0001613733698990568,
"loss": 2.1447,
"step": 2130
},
{
"epoch": 0.72,
"grad_norm": 0.150390625,
"learning_rate": 0.00016113968128691933,
"loss": 2.1404,
"step": 2135
},
{
"epoch": 0.72,
"grad_norm": 0.15234375,
"learning_rate": 0.00016090545825444506,
"loss": 2.1619,
"step": 2140
},
{
"epoch": 0.73,
"grad_norm": 0.1494140625,
"learning_rate": 0.00016067070284897137,
"loss": 2.1606,
"step": 2145
},
{
"epoch": 0.73,
"grad_norm": 0.1552734375,
"learning_rate": 0.000160435417122489,
"loss": 2.1262,
"step": 2150
},
{
"epoch": 0.73,
"grad_norm": 0.1513671875,
"learning_rate": 0.00016019960313162434,
"loss": 2.1193,
"step": 2155
},
{
"epoch": 0.73,
"grad_norm": 0.15234375,
"learning_rate": 0.0001599632629376212,
"loss": 2.1529,
"step": 2160
},
{
"epoch": 0.73,
"grad_norm": 0.1484375,
"learning_rate": 0.00015972639860632292,
"loss": 2.1828,
"step": 2165
},
{
"epoch": 0.73,
"grad_norm": 0.1552734375,
"learning_rate": 0.00015948901220815445,
"loss": 2.1639,
"step": 2170
},
{
"epoch": 0.74,
"grad_norm": 0.1533203125,
"learning_rate": 0.00015925110581810394,
"loss": 2.1432,
"step": 2175
},
{
"epoch": 0.74,
"grad_norm": 0.1533203125,
"learning_rate": 0.00015901268151570491,
"loss": 2.1862,
"step": 2180
},
{
"epoch": 0.74,
"grad_norm": 0.1484375,
"learning_rate": 0.0001587737413850178,
"loss": 2.1314,
"step": 2185
},
{
"epoch": 0.74,
"grad_norm": 0.15625,
"learning_rate": 0.00015853428751461202,
"loss": 2.1317,
"step": 2190
},
{
"epoch": 0.74,
"grad_norm": 0.15234375,
"learning_rate": 0.00015829432199754756,
"loss": 2.1344,
"step": 2195
},
{
"epoch": 0.75,
"grad_norm": 0.15234375,
"learning_rate": 0.0001580538469313566,
"loss": 2.1354,
"step": 2200
},
{
"epoch": 0.75,
"grad_norm": 0.1484375,
"learning_rate": 0.00015781286441802534,
"loss": 2.1349,
"step": 2205
},
{
"epoch": 0.75,
"grad_norm": 0.1552734375,
"learning_rate": 0.00015757137656397557,
"loss": 2.1039,
"step": 2210
},
{
"epoch": 0.75,
"grad_norm": 0.15234375,
"learning_rate": 0.0001573293854800462,
"loss": 2.1074,
"step": 2215
},
{
"epoch": 0.75,
"grad_norm": 0.1533203125,
"learning_rate": 0.00015708689328147493,
"loss": 2.131,
"step": 2220
},
{
"epoch": 0.75,
"grad_norm": 0.1513671875,
"learning_rate": 0.00015684390208787962,
"loss": 2.1432,
"step": 2225
},
{
"epoch": 0.76,
"grad_norm": 0.1533203125,
"learning_rate": 0.0001566004140232399,
"loss": 2.1529,
"step": 2230
},
{
"epoch": 0.76,
"grad_norm": 0.1474609375,
"learning_rate": 0.00015635643121587848,
"loss": 2.1641,
"step": 2235
},
{
"epoch": 0.76,
"grad_norm": 0.1552734375,
"learning_rate": 0.00015611195579844265,
"loss": 2.1368,
"step": 2240
},
{
"epoch": 0.76,
"grad_norm": 0.15625,
"learning_rate": 0.00015586698990788554,
"loss": 2.1537,
"step": 2245
},
{
"epoch": 0.76,
"grad_norm": 0.1474609375,
"learning_rate": 0.00015562153568544752,
"loss": 2.1305,
"step": 2250
},
{
"epoch": 0.76,
"grad_norm": 0.15234375,
"learning_rate": 0.00015537559527663744,
"loss": 2.1831,
"step": 2255
},
{
"epoch": 0.77,
"grad_norm": 0.15625,
"learning_rate": 0.00015512917083121397,
"loss": 2.1523,
"step": 2260
},
{
"epoch": 0.77,
"grad_norm": 0.15234375,
"learning_rate": 0.00015488226450316664,
"loss": 2.1115,
"step": 2265
},
{
"epoch": 0.77,
"grad_norm": 0.1533203125,
"learning_rate": 0.00015463487845069707,
"loss": 2.1527,
"step": 2270
},
{
"epoch": 0.77,
"grad_norm": 0.1572265625,
"learning_rate": 0.00015438701483620027,
"loss": 2.1486,
"step": 2275
},
{
"epoch": 0.77,
"grad_norm": 0.15234375,
"learning_rate": 0.00015413867582624553,
"loss": 2.1296,
"step": 2280
},
{
"epoch": 0.77,
"grad_norm": 0.15234375,
"learning_rate": 0.00015388986359155758,
"loss": 2.1408,
"step": 2285
},
{
"epoch": 0.78,
"grad_norm": 0.1552734375,
"learning_rate": 0.0001536405803069975,
"loss": 2.1033,
"step": 2290
},
{
"epoch": 0.78,
"grad_norm": 0.1513671875,
"learning_rate": 0.00015339082815154394,
"loss": 2.1389,
"step": 2295
},
{
"epoch": 0.78,
"grad_norm": 0.15625,
"learning_rate": 0.00015314060930827393,
"loss": 2.1754,
"step": 2300
},
{
"epoch": 0.78,
"grad_norm": 0.1552734375,
"learning_rate": 0.0001528899259643437,
"loss": 2.1608,
"step": 2305
},
{
"epoch": 0.78,
"grad_norm": 0.1513671875,
"learning_rate": 0.00015263878031096975,
"loss": 2.1442,
"step": 2310
},
{
"epoch": 0.78,
"grad_norm": 0.15625,
"learning_rate": 0.00015238717454340957,
"loss": 2.1384,
"step": 2315
},
{
"epoch": 0.79,
"grad_norm": 0.1533203125,
"learning_rate": 0.00015213511086094254,
"loss": 2.1206,
"step": 2320
},
{
"epoch": 0.79,
"grad_norm": 0.15234375,
"learning_rate": 0.00015188259146685064,
"loss": 2.1266,
"step": 2325
},
{
"epoch": 0.79,
"grad_norm": 0.15234375,
"learning_rate": 0.0001516296185683992,
"loss": 2.1565,
"step": 2330
},
{
"epoch": 0.79,
"grad_norm": 0.1552734375,
"learning_rate": 0.00015137619437681767,
"loss": 2.1287,
"step": 2335
},
{
"epoch": 0.79,
"grad_norm": 0.162109375,
"learning_rate": 0.00015112232110728015,
"loss": 2.1516,
"step": 2340
},
{
"epoch": 0.79,
"grad_norm": 0.15234375,
"learning_rate": 0.00015086800097888624,
"loss": 2.1424,
"step": 2345
},
{
"epoch": 0.8,
"grad_norm": 0.15625,
"learning_rate": 0.00015061323621464134,
"loss": 2.1345,
"step": 2350
},
{
"epoch": 0.8,
"grad_norm": 0.154296875,
"learning_rate": 0.00015035802904143762,
"loss": 2.1628,
"step": 2355
},
{
"epoch": 0.8,
"grad_norm": 0.15234375,
"learning_rate": 0.0001501023816900342,
"loss": 2.168,
"step": 2360
},
{
"epoch": 0.8,
"grad_norm": 0.15625,
"learning_rate": 0.00014984629639503785,
"loss": 2.1476,
"step": 2365
},
{
"epoch": 0.8,
"grad_norm": 0.150390625,
"learning_rate": 0.0001495897753948833,
"loss": 2.1485,
"step": 2370
},
{
"epoch": 0.8,
"grad_norm": 0.1533203125,
"learning_rate": 0.00014933282093181383,
"loss": 2.1508,
"step": 2375
},
{
"epoch": 0.81,
"grad_norm": 0.15625,
"learning_rate": 0.00014907543525186166,
"loss": 2.1244,
"step": 2380
},
{
"epoch": 0.81,
"grad_norm": 0.1533203125,
"learning_rate": 0.00014881762060482814,
"loss": 2.1229,
"step": 2385
},
{
"epoch": 0.81,
"grad_norm": 0.150390625,
"learning_rate": 0.00014855937924426434,
"loss": 2.1333,
"step": 2390
},
{
"epoch": 0.81,
"grad_norm": 0.1513671875,
"learning_rate": 0.00014830071342745112,
"loss": 2.1472,
"step": 2395
},
{
"epoch": 0.81,
"grad_norm": 0.15234375,
"learning_rate": 0.00014804162541537955,
"loss": 2.1543,
"step": 2400
},
{
"epoch": 0.81,
"grad_norm": 0.1513671875,
"learning_rate": 0.00014778211747273114,
"loss": 2.1675,
"step": 2405
},
{
"epoch": 0.82,
"grad_norm": 0.1455078125,
"learning_rate": 0.00014752219186785784,
"loss": 2.1267,
"step": 2410
},
{
"epoch": 0.82,
"grad_norm": 0.154296875,
"learning_rate": 0.0001472618508727626,
"loss": 2.1662,
"step": 2415
},
{
"epoch": 0.82,
"grad_norm": 0.154296875,
"learning_rate": 0.00014700109676307914,
"loss": 2.1349,
"step": 2420
},
{
"epoch": 0.82,
"grad_norm": 0.15625,
"learning_rate": 0.0001467399318180522,
"loss": 2.1568,
"step": 2425
},
{
"epoch": 0.82,
"grad_norm": 0.154296875,
"learning_rate": 0.0001464783583205177,
"loss": 2.1437,
"step": 2430
},
{
"epoch": 0.82,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001462163785568826,
"loss": 2.1418,
"step": 2435
},
{
"epoch": 0.83,
"grad_norm": 0.1552734375,
"learning_rate": 0.00014595399481710515,
"loss": 2.1295,
"step": 2440
},
{
"epoch": 0.83,
"grad_norm": 0.1572265625,
"learning_rate": 0.00014569120939467465,
"loss": 2.1445,
"step": 2445
},
{
"epoch": 0.83,
"grad_norm": 0.1552734375,
"learning_rate": 0.00014542802458659152,
"loss": 2.1483,
"step": 2450
},
{
"epoch": 0.83,
"grad_norm": 0.1552734375,
"learning_rate": 0.0001451644426933472,
"loss": 2.128,
"step": 2455
},
{
"epoch": 0.83,
"grad_norm": 0.15625,
"learning_rate": 0.00014490046601890405,
"loss": 2.1444,
"step": 2460
},
{
"epoch": 0.83,
"grad_norm": 0.1513671875,
"learning_rate": 0.00014463609687067526,
"loss": 2.1529,
"step": 2465
},
{
"epoch": 0.84,
"grad_norm": 0.1513671875,
"learning_rate": 0.00014437133755950448,
"loss": 2.154,
"step": 2470
},
{
"epoch": 0.84,
"grad_norm": 0.150390625,
"learning_rate": 0.00014410619039964586,
"loss": 2.1333,
"step": 2475
},
{
"epoch": 0.84,
"grad_norm": 0.1552734375,
"learning_rate": 0.00014384065770874373,
"loss": 2.1381,
"step": 2480
},
{
"epoch": 0.84,
"grad_norm": 0.15625,
"learning_rate": 0.00014357474180781232,
"loss": 2.1294,
"step": 2485
},
{
"epoch": 0.84,
"grad_norm": 0.15625,
"learning_rate": 0.00014330844502121547,
"loss": 2.1312,
"step": 2490
},
{
"epoch": 0.85,
"grad_norm": 0.1533203125,
"learning_rate": 0.00014304176967664637,
"loss": 2.1037,
"step": 2495
},
{
"epoch": 0.85,
"grad_norm": 0.154296875,
"learning_rate": 0.0001427747181051071,
"loss": 2.127,
"step": 2500
},
{
"epoch": 0.85,
"grad_norm": 0.1513671875,
"learning_rate": 0.00014250729264088843,
"loss": 2.1565,
"step": 2505
},
{
"epoch": 0.85,
"grad_norm": 0.154296875,
"learning_rate": 0.00014223949562154929,
"loss": 2.1459,
"step": 2510
},
{
"epoch": 0.85,
"grad_norm": 0.15234375,
"learning_rate": 0.00014197132938789629,
"loss": 2.1221,
"step": 2515
},
{
"epoch": 0.85,
"grad_norm": 0.158203125,
"learning_rate": 0.0001417027962839634,
"loss": 2.15,
"step": 2520
},
{
"epoch": 0.86,
"grad_norm": 0.1513671875,
"learning_rate": 0.00014143389865699132,
"loss": 2.1569,
"step": 2525
},
{
"epoch": 0.86,
"grad_norm": 0.1552734375,
"learning_rate": 0.00014116463885740723,
"loss": 2.129,
"step": 2530
},
{
"epoch": 0.86,
"grad_norm": 0.154296875,
"learning_rate": 0.00014089501923880384,
"loss": 2.1546,
"step": 2535
},
{
"epoch": 0.86,
"grad_norm": 0.1533203125,
"learning_rate": 0.00014062504215791905,
"loss": 2.1305,
"step": 2540
},
{
"epoch": 0.86,
"grad_norm": 0.1552734375,
"learning_rate": 0.00014035470997461548,
"loss": 2.1207,
"step": 2545
},
{
"epoch": 0.86,
"grad_norm": 0.1572265625,
"learning_rate": 0.00014008402505185952,
"loss": 2.1112,
"step": 2550
},
{
"epoch": 0.87,
"grad_norm": 0.154296875,
"learning_rate": 0.000139812989755701,
"loss": 2.1232,
"step": 2555
},
{
"epoch": 0.87,
"grad_norm": 0.1552734375,
"learning_rate": 0.00013954160645525217,
"loss": 2.1253,
"step": 2560
},
{
"epoch": 0.87,
"grad_norm": 0.1591796875,
"learning_rate": 0.00013926987752266735,
"loss": 2.1361,
"step": 2565
},
{
"epoch": 0.87,
"grad_norm": 0.1552734375,
"learning_rate": 0.0001389978053331219,
"loss": 2.1494,
"step": 2570
},
{
"epoch": 0.87,
"grad_norm": 0.15234375,
"learning_rate": 0.00013872539226479172,
"loss": 2.1389,
"step": 2575
},
{
"epoch": 0.87,
"grad_norm": 0.16015625,
"learning_rate": 0.00013845264069883216,
"loss": 2.1307,
"step": 2580
},
{
"epoch": 0.88,
"grad_norm": 0.15625,
"learning_rate": 0.00013817955301935743,
"loss": 2.1318,
"step": 2585
},
{
"epoch": 0.88,
"grad_norm": 0.15234375,
"learning_rate": 0.0001379061316134198,
"loss": 2.1408,
"step": 2590
},
{
"epoch": 0.88,
"grad_norm": 0.1611328125,
"learning_rate": 0.00013763237887098843,
"loss": 2.1299,
"step": 2595
},
{
"epoch": 0.88,
"grad_norm": 0.154296875,
"learning_rate": 0.0001373582971849289,
"loss": 2.1311,
"step": 2600
},
{
"epoch": 0.88,
"grad_norm": 0.1552734375,
"learning_rate": 0.00013708388895098192,
"loss": 2.1358,
"step": 2605
},
{
"epoch": 0.88,
"grad_norm": 0.15625,
"learning_rate": 0.00013680915656774265,
"loss": 2.1451,
"step": 2610
},
{
"epoch": 0.89,
"grad_norm": 0.1572265625,
"learning_rate": 0.00013653410243663952,
"loss": 2.1328,
"step": 2615
},
{
"epoch": 0.89,
"grad_norm": 0.154296875,
"learning_rate": 0.00013625872896191345,
"loss": 2.1629,
"step": 2620
},
{
"epoch": 0.89,
"grad_norm": 0.16015625,
"learning_rate": 0.0001359830385505967,
"loss": 2.1212,
"step": 2625
},
{
"epoch": 0.89,
"grad_norm": 0.1552734375,
"learning_rate": 0.00013570703361249188,
"loss": 2.1541,
"step": 2630
},
{
"epoch": 0.89,
"grad_norm": 0.158203125,
"learning_rate": 0.00013543071656015084,
"loss": 2.122,
"step": 2635
},
{
"epoch": 0.89,
"grad_norm": 0.158203125,
"learning_rate": 0.0001351540898088536,
"loss": 2.1543,
"step": 2640
},
{
"epoch": 0.9,
"grad_norm": 0.1552734375,
"learning_rate": 0.00013487715577658726,
"loss": 2.1587,
"step": 2645
},
{
"epoch": 0.9,
"grad_norm": 0.150390625,
"learning_rate": 0.00013459991688402492,
"loss": 2.1158,
"step": 2650
},
{
"epoch": 0.9,
"grad_norm": 0.154296875,
"learning_rate": 0.00013432237555450444,
"loss": 2.1499,
"step": 2655
},
{
"epoch": 0.9,
"grad_norm": 0.154296875,
"learning_rate": 0.00013404453421400714,
"loss": 2.152,
"step": 2660
},
{
"epoch": 0.9,
"grad_norm": 0.15625,
"learning_rate": 0.00013376639529113688,
"loss": 2.1233,
"step": 2665
},
{
"epoch": 0.9,
"grad_norm": 0.15234375,
"learning_rate": 0.00013348796121709862,
"loss": 2.1329,
"step": 2670
},
{
"epoch": 0.91,
"grad_norm": 0.1572265625,
"learning_rate": 0.00013320923442567727,
"loss": 2.1402,
"step": 2675
},
{
"epoch": 0.91,
"grad_norm": 0.15234375,
"learning_rate": 0.00013293021735321628,
"loss": 2.1526,
"step": 2680
},
{
"epoch": 0.91,
"grad_norm": 0.1591796875,
"learning_rate": 0.00013265091243859652,
"loss": 2.1235,
"step": 2685
},
{
"epoch": 0.91,
"grad_norm": 0.15625,
"learning_rate": 0.00013237132212321487,
"loss": 2.1442,
"step": 2690
},
{
"epoch": 0.91,
"grad_norm": 0.158203125,
"learning_rate": 0.0001320914488509629,
"loss": 2.1558,
"step": 2695
},
{
"epoch": 0.91,
"grad_norm": 0.1630859375,
"learning_rate": 0.00013181129506820545,
"loss": 2.1194,
"step": 2700
},
{
"epoch": 0.92,
"grad_norm": 0.158203125,
"learning_rate": 0.0001315308632237593,
"loss": 2.1201,
"step": 2705
},
{
"epoch": 0.92,
"grad_norm": 0.1533203125,
"learning_rate": 0.00013125015576887186,
"loss": 2.1523,
"step": 2710
},
{
"epoch": 0.92,
"grad_norm": 0.15625,
"learning_rate": 0.00013096917515719952,
"loss": 2.1441,
"step": 2715
},
{
"epoch": 0.92,
"grad_norm": 0.16015625,
"learning_rate": 0.00013068792384478636,
"loss": 2.1487,
"step": 2720
},
{
"epoch": 0.92,
"grad_norm": 0.1552734375,
"learning_rate": 0.00013040640429004267,
"loss": 2.1624,
"step": 2725
},
{
"epoch": 0.92,
"grad_norm": 0.1513671875,
"learning_rate": 0.00013012461895372344,
"loss": 2.1395,
"step": 2730
},
{
"epoch": 0.93,
"grad_norm": 0.15625,
"learning_rate": 0.00012984257029890683,
"loss": 2.1419,
"step": 2735
},
{
"epoch": 0.93,
"grad_norm": 0.1552734375,
"learning_rate": 0.00012956026079097272,
"loss": 2.1476,
"step": 2740
},
{
"epoch": 0.93,
"grad_norm": 0.1533203125,
"learning_rate": 0.000129277692897581,
"loss": 2.1373,
"step": 2745
},
{
"epoch": 0.93,
"grad_norm": 0.158203125,
"learning_rate": 0.00012899486908865012,
"loss": 2.138,
"step": 2750
},
{
"epoch": 0.93,
"grad_norm": 0.154296875,
"learning_rate": 0.0001287117918363356,
"loss": 2.1418,
"step": 2755
},
{
"epoch": 0.93,
"grad_norm": 0.15625,
"learning_rate": 0.00012842846361500816,
"loss": 2.1108,
"step": 2760
},
{
"epoch": 0.94,
"grad_norm": 0.1572265625,
"learning_rate": 0.00012814488690123226,
"loss": 2.1407,
"step": 2765
},
{
"epoch": 0.94,
"grad_norm": 0.1572265625,
"learning_rate": 0.00012786106417374455,
"loss": 2.1761,
"step": 2770
},
{
"epoch": 0.94,
"grad_norm": 0.158203125,
"learning_rate": 0.00012757699791343186,
"loss": 2.1446,
"step": 2775
},
{
"epoch": 0.94,
"grad_norm": 0.15625,
"learning_rate": 0.00012729269060330999,
"loss": 2.1306,
"step": 2780
},
{
"epoch": 0.94,
"grad_norm": 0.15234375,
"learning_rate": 0.0001270081447285015,
"loss": 2.1432,
"step": 2785
},
{
"epoch": 0.94,
"grad_norm": 0.158203125,
"learning_rate": 0.00012672336277621442,
"loss": 2.1351,
"step": 2790
},
{
"epoch": 0.95,
"grad_norm": 0.16015625,
"learning_rate": 0.0001264383472357202,
"loss": 2.1395,
"step": 2795
},
{
"epoch": 0.95,
"grad_norm": 0.1611328125,
"learning_rate": 0.0001261531005983322,
"loss": 2.1492,
"step": 2800
},
{
"epoch": 0.95,
"grad_norm": 0.158203125,
"learning_rate": 0.00012586762535738374,
"loss": 2.1345,
"step": 2805
},
{
"epoch": 0.95,
"grad_norm": 0.158203125,
"learning_rate": 0.0001255819240082063,
"loss": 2.1352,
"step": 2810
},
{
"epoch": 0.95,
"grad_norm": 0.158203125,
"learning_rate": 0.00012529599904810784,
"loss": 2.1554,
"step": 2815
},
{
"epoch": 0.96,
"grad_norm": 0.16015625,
"learning_rate": 0.00012500985297635088,
"loss": 2.1249,
"step": 2820
},
{
"epoch": 0.96,
"grad_norm": 0.1591796875,
"learning_rate": 0.00012472348829413064,
"loss": 2.1478,
"step": 2825
},
{
"epoch": 0.96,
"grad_norm": 0.1591796875,
"learning_rate": 0.00012443690750455326,
"loss": 2.1159,
"step": 2830
},
{
"epoch": 0.96,
"grad_norm": 0.1533203125,
"learning_rate": 0.0001241501131126138,
"loss": 2.1667,
"step": 2835
},
{
"epoch": 0.96,
"grad_norm": 0.1572265625,
"learning_rate": 0.00012386310762517452,
"loss": 2.134,
"step": 2840
},
{
"epoch": 0.96,
"grad_norm": 0.158203125,
"learning_rate": 0.00012357589355094275,
"loss": 2.1089,
"step": 2845
},
{
"epoch": 0.97,
"grad_norm": 0.15625,
"learning_rate": 0.0001232884734004491,
"loss": 2.1316,
"step": 2850
},
{
"epoch": 0.97,
"grad_norm": 0.154296875,
"learning_rate": 0.00012300084968602549,
"loss": 2.1116,
"step": 2855
},
{
"epoch": 0.97,
"grad_norm": 0.154296875,
"learning_rate": 0.00012271302492178327,
"loss": 2.138,
"step": 2860
},
{
"epoch": 0.97,
"grad_norm": 0.1552734375,
"learning_rate": 0.00012242500162359105,
"loss": 2.1222,
"step": 2865
},
{
"epoch": 0.97,
"grad_norm": 0.1591796875,
"learning_rate": 0.00012213678230905284,
"loss": 2.1652,
"step": 2870
},
{
"epoch": 0.97,
"grad_norm": 0.154296875,
"learning_rate": 0.00012184836949748608,
"loss": 2.1159,
"step": 2875
},
{
"epoch": 0.98,
"grad_norm": 0.1611328125,
"learning_rate": 0.00012155976570989949,
"loss": 2.124,
"step": 2880
},
{
"epoch": 0.98,
"grad_norm": 0.1591796875,
"learning_rate": 0.0001212709734689712,
"loss": 2.1345,
"step": 2885
},
{
"epoch": 0.98,
"grad_norm": 0.158203125,
"learning_rate": 0.00012098199529902648,
"loss": 2.134,
"step": 2890
},
{
"epoch": 0.98,
"grad_norm": 0.154296875,
"learning_rate": 0.0001206928337260159,
"loss": 2.1264,
"step": 2895
},
{
"epoch": 0.98,
"grad_norm": 0.158203125,
"learning_rate": 0.00012040349127749313,
"loss": 2.153,
"step": 2900
},
{
"epoch": 0.98,
"grad_norm": 0.15625,
"learning_rate": 0.00012011397048259285,
"loss": 2.1381,
"step": 2905
},
{
"epoch": 0.99,
"grad_norm": 0.154296875,
"learning_rate": 0.00011982427387200867,
"loss": 2.1187,
"step": 2910
},
{
"epoch": 0.99,
"grad_norm": 0.158203125,
"learning_rate": 0.00011953440397797097,
"loss": 2.1583,
"step": 2915
},
{
"epoch": 0.99,
"grad_norm": 0.1552734375,
"learning_rate": 0.00011924436333422489,
"loss": 2.1315,
"step": 2920
},
{
"epoch": 0.99,
"grad_norm": 0.1552734375,
"learning_rate": 0.000118954154476008,
"loss": 2.1223,
"step": 2925
},
{
"epoch": 0.99,
"grad_norm": 0.1572265625,
"learning_rate": 0.0001186637799400282,
"loss": 2.1399,
"step": 2930
},
{
"epoch": 0.99,
"grad_norm": 0.1552734375,
"learning_rate": 0.00011837324226444169,
"loss": 2.1389,
"step": 2935
},
{
"epoch": 1.0,
"grad_norm": 0.1552734375,
"learning_rate": 0.00011808254398883056,
"loss": 2.1443,
"step": 2940
},
{
"epoch": 1.0,
"grad_norm": 0.1552734375,
"learning_rate": 0.00011779168765418079,
"loss": 2.1315,
"step": 2945
},
{
"epoch": 1.0,
"grad_norm": 0.1572265625,
"learning_rate": 0.00011750067580285988,
"loss": 2.1288,
"step": 2950
},
{
"epoch": 1.0,
"eval_loss": 2.133820056915283,
"eval_runtime": 154.3826,
"eval_samples_per_second": 8.453,
"eval_steps_per_second": 1.062,
"step": 2952
},
{
"epoch": 1.0,
"grad_norm": 0.1572265625,
"learning_rate": 0.00011720951097859476,
"loss": 2.1114,
"step": 2955
},
{
"epoch": 1.0,
"grad_norm": 0.154296875,
"learning_rate": 0.00011691819572644939,
"loss": 2.0924,
"step": 2960
},
{
"epoch": 1.0,
"grad_norm": 0.15234375,
"learning_rate": 0.00011662673259280276,
"loss": 2.1001,
"step": 2965
},
{
"epoch": 1.01,
"grad_norm": 0.1572265625,
"learning_rate": 0.00011633512412532637,
"loss": 2.1237,
"step": 2970
},
{
"epoch": 1.01,
"grad_norm": 0.158203125,
"learning_rate": 0.0001160433728729621,
"loss": 2.1129,
"step": 2975
},
{
"epoch": 1.01,
"grad_norm": 0.158203125,
"learning_rate": 0.00011575148138589996,
"loss": 2.1217,
"step": 2980
},
{
"epoch": 1.01,
"grad_norm": 0.158203125,
"learning_rate": 0.0001154594522155557,
"loss": 2.1345,
"step": 2985
},
{
"epoch": 1.01,
"grad_norm": 0.1630859375,
"learning_rate": 0.00011516728791454861,
"loss": 2.1045,
"step": 2990
},
{
"epoch": 1.01,
"grad_norm": 0.162109375,
"learning_rate": 0.00011487499103667904,
"loss": 2.0972,
"step": 2995
},
{
"epoch": 1.02,
"grad_norm": 0.16015625,
"learning_rate": 0.00011458256413690633,
"loss": 2.0935,
"step": 3000
},
{
"epoch": 1.02,
"grad_norm": 0.162109375,
"learning_rate": 0.00011429000977132629,
"loss": 2.1148,
"step": 3005
},
{
"epoch": 1.02,
"grad_norm": 0.162109375,
"learning_rate": 0.00011399733049714884,
"loss": 2.1049,
"step": 3010
},
{
"epoch": 1.02,
"grad_norm": 0.1611328125,
"learning_rate": 0.00011370452887267582,
"loss": 2.0877,
"step": 3015
},
{
"epoch": 1.02,
"grad_norm": 0.1640625,
"learning_rate": 0.00011341160745727844,
"loss": 2.1226,
"step": 3020
},
{
"epoch": 1.02,
"grad_norm": 0.16015625,
"learning_rate": 0.00011311856881137506,
"loss": 2.1123,
"step": 3025
},
{
"epoch": 1.03,
"grad_norm": 0.162109375,
"learning_rate": 0.00011282541549640873,
"loss": 2.0914,
"step": 3030
},
{
"epoch": 1.03,
"grad_norm": 0.1630859375,
"learning_rate": 0.0001125321500748248,
"loss": 2.0834,
"step": 3035
},
{
"epoch": 1.03,
"grad_norm": 0.162109375,
"learning_rate": 0.00011223877511004863,
"loss": 2.1109,
"step": 3040
},
{
"epoch": 1.03,
"grad_norm": 0.16015625,
"learning_rate": 0.00011194529316646293,
"loss": 2.1353,
"step": 3045
},
{
"epoch": 1.03,
"grad_norm": 0.1591796875,
"learning_rate": 0.00011165170680938572,
"loss": 2.1066,
"step": 3050
},
{
"epoch": 1.03,
"grad_norm": 0.1572265625,
"learning_rate": 0.00011135801860504749,
"loss": 2.1288,
"step": 3055
},
{
"epoch": 1.04,
"grad_norm": 0.1611328125,
"learning_rate": 0.00011106423112056911,
"loss": 2.1506,
"step": 3060
},
{
"epoch": 1.04,
"grad_norm": 0.16015625,
"learning_rate": 0.00011077034692393917,
"loss": 2.1051,
"step": 3065
},
{
"epoch": 1.04,
"grad_norm": 0.1591796875,
"learning_rate": 0.00011047636858399169,
"loss": 2.1196,
"step": 3070
},
{
"epoch": 1.04,
"grad_norm": 0.1611328125,
"learning_rate": 0.00011018229867038356,
"loss": 2.0859,
"step": 3075
},
{
"epoch": 1.04,
"grad_norm": 0.1650390625,
"learning_rate": 0.00010988813975357208,
"loss": 2.1272,
"step": 3080
},
{
"epoch": 1.04,
"grad_norm": 0.1591796875,
"learning_rate": 0.00010959389440479264,
"loss": 2.1135,
"step": 3085
},
{
"epoch": 1.05,
"grad_norm": 0.1611328125,
"learning_rate": 0.00010929956519603594,
"loss": 2.1099,
"step": 3090
},
{
"epoch": 1.05,
"grad_norm": 0.1650390625,
"learning_rate": 0.00010900515470002595,
"loss": 2.122,
"step": 3095
},
{
"epoch": 1.05,
"grad_norm": 0.1640625,
"learning_rate": 0.00010871066549019688,
"loss": 2.1026,
"step": 3100
},
{
"epoch": 1.05,
"grad_norm": 0.162109375,
"learning_rate": 0.0001084161001406712,
"loss": 2.1147,
"step": 3105
},
{
"epoch": 1.05,
"grad_norm": 0.16015625,
"learning_rate": 0.00010812146122623683,
"loss": 2.0943,
"step": 3110
},
{
"epoch": 1.06,
"grad_norm": 0.162109375,
"learning_rate": 0.00010782675132232474,
"loss": 2.133,
"step": 3115
},
{
"epoch": 1.06,
"grad_norm": 0.16015625,
"learning_rate": 0.00010753197300498638,
"loss": 2.1119,
"step": 3120
},
{
"epoch": 1.06,
"grad_norm": 0.166015625,
"learning_rate": 0.00010723712885087123,
"loss": 2.0959,
"step": 3125
},
{
"epoch": 1.06,
"grad_norm": 0.1611328125,
"learning_rate": 0.00010694222143720423,
"loss": 2.0654,
"step": 3130
},
{
"epoch": 1.06,
"grad_norm": 0.1611328125,
"learning_rate": 0.00010664725334176331,
"loss": 2.1195,
"step": 3135
},
{
"epoch": 1.06,
"grad_norm": 0.1650390625,
"learning_rate": 0.00010635222714285676,
"loss": 2.1052,
"step": 3140
},
{
"epoch": 1.07,
"grad_norm": 0.16796875,
"learning_rate": 0.0001060571454193008,
"loss": 2.1175,
"step": 3145
},
{
"epoch": 1.07,
"grad_norm": 0.1640625,
"learning_rate": 0.00010576201075039696,
"loss": 2.0998,
"step": 3150
},
{
"epoch": 1.07,
"grad_norm": 0.1650390625,
"learning_rate": 0.00010546682571590958,
"loss": 2.1202,
"step": 3155
},
{
"epoch": 1.07,
"grad_norm": 0.1640625,
"learning_rate": 0.00010517159289604324,
"loss": 2.1058,
"step": 3160
},
{
"epoch": 1.07,
"grad_norm": 0.162109375,
"learning_rate": 0.00010487631487142017,
"loss": 2.135,
"step": 3165
},
{
"epoch": 1.07,
"grad_norm": 0.162109375,
"learning_rate": 0.00010458099422305785,
"loss": 2.1585,
"step": 3170
},
{
"epoch": 1.08,
"grad_norm": 0.1669921875,
"learning_rate": 0.0001042856335323462,
"loss": 2.1095,
"step": 3175
},
{
"epoch": 1.08,
"grad_norm": 0.16015625,
"learning_rate": 0.00010399023538102522,
"loss": 2.1009,
"step": 3180
},
{
"epoch": 1.08,
"grad_norm": 0.16015625,
"learning_rate": 0.00010369480235116229,
"loss": 2.1125,
"step": 3185
},
{
"epoch": 1.08,
"grad_norm": 0.1650390625,
"learning_rate": 0.00010339933702512979,
"loss": 2.156,
"step": 3190
},
{
"epoch": 1.08,
"grad_norm": 0.1640625,
"learning_rate": 0.00010310384198558225,
"loss": 2.1253,
"step": 3195
},
{
"epoch": 1.08,
"grad_norm": 0.16015625,
"learning_rate": 0.00010280831981543405,
"loss": 2.0947,
"step": 3200
},
{
"epoch": 1.09,
"grad_norm": 0.1640625,
"learning_rate": 0.00010251277309783663,
"loss": 2.0967,
"step": 3205
},
{
"epoch": 1.09,
"grad_norm": 0.1591796875,
"learning_rate": 0.00010221720441615599,
"loss": 2.1105,
"step": 3210
},
{
"epoch": 1.09,
"grad_norm": 0.162109375,
"learning_rate": 0.00010192161635395026,
"loss": 2.0715,
"step": 3215
},
{
"epoch": 1.09,
"grad_norm": 0.1591796875,
"learning_rate": 0.00010162601149494676,
"loss": 2.1251,
"step": 3220
},
{
"epoch": 1.09,
"grad_norm": 0.1630859375,
"learning_rate": 0.00010133039242301985,
"loss": 2.0996,
"step": 3225
},
{
"epoch": 1.09,
"grad_norm": 0.1611328125,
"learning_rate": 0.00010103476172216792,
"loss": 2.1182,
"step": 3230
},
{
"epoch": 1.1,
"grad_norm": 0.162109375,
"learning_rate": 0.00010073912197649116,
"loss": 2.0768,
"step": 3235
},
{
"epoch": 1.1,
"grad_norm": 0.162109375,
"learning_rate": 0.0001004434757701688,
"loss": 2.1404,
"step": 3240
},
{
"epoch": 1.1,
"grad_norm": 0.1630859375,
"learning_rate": 0.00010014782568743641,
"loss": 2.1353,
"step": 3245
},
{
"epoch": 1.1,
"grad_norm": 0.1630859375,
"learning_rate": 9.98521743125636e-05,
"loss": 2.1234,
"step": 3250
},
{
"epoch": 1.1,
"grad_norm": 0.1630859375,
"learning_rate": 9.955652422983122e-05,
"loss": 2.1039,
"step": 3255
},
{
"epoch": 1.1,
"grad_norm": 0.16796875,
"learning_rate": 9.926087802350886e-05,
"loss": 2.116,
"step": 3260
},
{
"epoch": 1.11,
"grad_norm": 0.1650390625,
"learning_rate": 9.896523827783207e-05,
"loss": 2.1135,
"step": 3265
},
{
"epoch": 1.11,
"grad_norm": 0.1640625,
"learning_rate": 9.866960757698017e-05,
"loss": 2.1019,
"step": 3270
},
{
"epoch": 1.11,
"grad_norm": 0.162109375,
"learning_rate": 9.837398850505324e-05,
"loss": 2.1152,
"step": 3275
},
{
"epoch": 1.11,
"grad_norm": 0.162109375,
"learning_rate": 9.807838364604978e-05,
"loss": 2.119,
"step": 3280
},
{
"epoch": 1.11,
"grad_norm": 0.1611328125,
"learning_rate": 9.7782795583844e-05,
"loss": 2.1126,
"step": 3285
},
{
"epoch": 1.11,
"grad_norm": 0.1630859375,
"learning_rate": 9.748722690216341e-05,
"loss": 2.1007,
"step": 3290
},
{
"epoch": 1.12,
"grad_norm": 0.162109375,
"learning_rate": 9.719168018456598e-05,
"loss": 2.1042,
"step": 3295
},
{
"epoch": 1.12,
"grad_norm": 0.1611328125,
"learning_rate": 9.689615801441774e-05,
"loss": 2.1124,
"step": 3300
},
{
"epoch": 1.12,
"grad_norm": 0.1640625,
"learning_rate": 9.660066297487022e-05,
"loss": 2.126,
"step": 3305
},
{
"epoch": 1.12,
"grad_norm": 0.1611328125,
"learning_rate": 9.630519764883772e-05,
"loss": 2.1014,
"step": 3310
},
{
"epoch": 1.12,
"grad_norm": 0.1640625,
"learning_rate": 9.600976461897483e-05,
"loss": 2.1159,
"step": 3315
},
{
"epoch": 1.12,
"grad_norm": 0.1650390625,
"learning_rate": 9.571436646765382e-05,
"loss": 2.1129,
"step": 3320
},
{
"epoch": 1.13,
"grad_norm": 0.162109375,
"learning_rate": 9.541900577694217e-05,
"loss": 2.1212,
"step": 3325
},
{
"epoch": 1.13,
"grad_norm": 0.1611328125,
"learning_rate": 9.512368512857984e-05,
"loss": 2.1061,
"step": 3330
},
{
"epoch": 1.13,
"grad_norm": 0.162109375,
"learning_rate": 9.482840710395675e-05,
"loss": 2.1036,
"step": 3335
},
{
"epoch": 1.13,
"grad_norm": 0.1650390625,
"learning_rate": 9.453317428409044e-05,
"loss": 2.1095,
"step": 3340
},
{
"epoch": 1.13,
"grad_norm": 0.166015625,
"learning_rate": 9.423798924960306e-05,
"loss": 2.1064,
"step": 3345
},
{
"epoch": 1.13,
"grad_norm": 0.16796875,
"learning_rate": 9.394285458069923e-05,
"loss": 2.1397,
"step": 3350
},
{
"epoch": 1.14,
"grad_norm": 0.1669921875,
"learning_rate": 9.364777285714324e-05,
"loss": 2.1006,
"step": 3355
},
{
"epoch": 1.14,
"grad_norm": 0.1650390625,
"learning_rate": 9.33527466582367e-05,
"loss": 2.1189,
"step": 3360
},
{
"epoch": 1.14,
"grad_norm": 0.166015625,
"learning_rate": 9.30577785627958e-05,
"loss": 2.1267,
"step": 3365
},
{
"epoch": 1.14,
"grad_norm": 0.1630859375,
"learning_rate": 9.276287114912878e-05,
"loss": 2.1253,
"step": 3370
},
{
"epoch": 1.14,
"grad_norm": 0.16796875,
"learning_rate": 9.246802699501363e-05,
"loss": 2.1334,
"step": 3375
},
{
"epoch": 1.14,
"grad_norm": 0.1630859375,
"learning_rate": 9.217324867767527e-05,
"loss": 2.1119,
"step": 3380
},
{
"epoch": 1.15,
"grad_norm": 0.1650390625,
"learning_rate": 9.187853877376318e-05,
"loss": 2.1104,
"step": 3385
},
{
"epoch": 1.15,
"grad_norm": 0.1669921875,
"learning_rate": 9.158389985932881e-05,
"loss": 2.1237,
"step": 3390
},
{
"epoch": 1.15,
"grad_norm": 0.1640625,
"learning_rate": 9.128933450980314e-05,
"loss": 2.1228,
"step": 3395
},
{
"epoch": 1.15,
"grad_norm": 0.1611328125,
"learning_rate": 9.099484529997409e-05,
"loss": 2.0859,
"step": 3400
},
{
"epoch": 1.15,
"grad_norm": 0.166015625,
"learning_rate": 9.070043480396404e-05,
"loss": 2.0926,
"step": 3405
},
{
"epoch": 1.15,
"grad_norm": 0.1640625,
"learning_rate": 9.04061055952074e-05,
"loss": 2.1346,
"step": 3410
},
{
"epoch": 1.16,
"grad_norm": 0.162109375,
"learning_rate": 9.011186024642793e-05,
"loss": 2.0959,
"step": 3415
},
{
"epoch": 1.16,
"grad_norm": 0.1611328125,
"learning_rate": 8.981770132961649e-05,
"loss": 2.1153,
"step": 3420
},
{
"epoch": 1.16,
"grad_norm": 0.162109375,
"learning_rate": 8.952363141600834e-05,
"loss": 2.1073,
"step": 3425
},
{
"epoch": 1.16,
"grad_norm": 0.1728515625,
"learning_rate": 8.922965307606086e-05,
"loss": 2.1085,
"step": 3430
},
{
"epoch": 1.16,
"grad_norm": 0.1630859375,
"learning_rate": 8.893576887943094e-05,
"loss": 2.0935,
"step": 3435
},
{
"epoch": 1.17,
"grad_norm": 0.1669921875,
"learning_rate": 8.86419813949525e-05,
"loss": 2.091,
"step": 3440
},
{
"epoch": 1.17,
"grad_norm": 0.166015625,
"learning_rate": 8.834829319061431e-05,
"loss": 2.1139,
"step": 3445
},
{
"epoch": 1.17,
"grad_norm": 0.16796875,
"learning_rate": 8.805470683353708e-05,
"loss": 2.1199,
"step": 3450
},
{
"epoch": 1.17,
"grad_norm": 0.171875,
"learning_rate": 8.77612248899514e-05,
"loss": 2.1037,
"step": 3455
},
{
"epoch": 1.17,
"grad_norm": 0.1630859375,
"learning_rate": 8.746784992517518e-05,
"loss": 2.0909,
"step": 3460
},
{
"epoch": 1.17,
"grad_norm": 0.1640625,
"learning_rate": 8.71745845035913e-05,
"loss": 2.1016,
"step": 3465
},
{
"epoch": 1.18,
"grad_norm": 0.1640625,
"learning_rate": 8.688143118862499e-05,
"loss": 2.1214,
"step": 3470
},
{
"epoch": 1.18,
"grad_norm": 0.16796875,
"learning_rate": 8.658839254272157e-05,
"loss": 2.1114,
"step": 3475
},
{
"epoch": 1.18,
"grad_norm": 0.169921875,
"learning_rate": 8.62954711273242e-05,
"loss": 2.1117,
"step": 3480
},
{
"epoch": 1.18,
"grad_norm": 0.162109375,
"learning_rate": 8.600266950285117e-05,
"loss": 2.1494,
"step": 3485
},
{
"epoch": 1.18,
"grad_norm": 0.1689453125,
"learning_rate": 8.570999022867373e-05,
"loss": 2.1147,
"step": 3490
},
{
"epoch": 1.18,
"grad_norm": 0.16796875,
"learning_rate": 8.541743586309365e-05,
"loss": 2.0888,
"step": 3495
},
{
"epoch": 1.19,
"grad_norm": 0.166015625,
"learning_rate": 8.512500896332097e-05,
"loss": 2.1074,
"step": 3500
},
{
"epoch": 1.19,
"grad_norm": 0.16796875,
"learning_rate": 8.483271208545144e-05,
"loss": 2.096,
"step": 3505
},
{
"epoch": 1.19,
"grad_norm": 0.1689453125,
"learning_rate": 8.454054778444431e-05,
"loss": 2.1305,
"step": 3510
},
{
"epoch": 1.19,
"grad_norm": 0.166015625,
"learning_rate": 8.424851861410007e-05,
"loss": 2.0802,
"step": 3515
},
{
"epoch": 1.19,
"grad_norm": 0.1728515625,
"learning_rate": 8.395662712703793e-05,
"loss": 2.0963,
"step": 3520
},
{
"epoch": 1.19,
"grad_norm": 0.16796875,
"learning_rate": 8.366487587467368e-05,
"loss": 2.1136,
"step": 3525
},
{
"epoch": 1.2,
"grad_norm": 0.16796875,
"learning_rate": 8.337326740719726e-05,
"loss": 2.0982,
"step": 3530
},
{
"epoch": 1.2,
"grad_norm": 0.1650390625,
"learning_rate": 8.308180427355062e-05,
"loss": 2.1252,
"step": 3535
},
{
"epoch": 1.2,
"grad_norm": 0.1640625,
"learning_rate": 8.279048902140528e-05,
"loss": 2.1102,
"step": 3540
},
{
"epoch": 1.2,
"grad_norm": 0.16796875,
"learning_rate": 8.24993241971401e-05,
"loss": 2.1124,
"step": 3545
},
{
"epoch": 1.2,
"grad_norm": 0.1669921875,
"learning_rate": 8.220831234581922e-05,
"loss": 2.1135,
"step": 3550
},
{
"epoch": 1.2,
"grad_norm": 0.166015625,
"learning_rate": 8.191745601116947e-05,
"loss": 2.1134,
"step": 3555
},
{
"epoch": 1.21,
"grad_norm": 0.16015625,
"learning_rate": 8.162675773555836e-05,
"loss": 2.0979,
"step": 3560
},
{
"epoch": 1.21,
"grad_norm": 0.162109375,
"learning_rate": 8.133622005997181e-05,
"loss": 2.095,
"step": 3565
},
{
"epoch": 1.21,
"grad_norm": 0.1689453125,
"learning_rate": 8.104584552399204e-05,
"loss": 2.1298,
"step": 3570
},
{
"epoch": 1.21,
"grad_norm": 0.166015625,
"learning_rate": 8.075563666577515e-05,
"loss": 2.1018,
"step": 3575
},
{
"epoch": 1.21,
"grad_norm": 0.1708984375,
"learning_rate": 8.046559602202901e-05,
"loss": 2.1075,
"step": 3580
},
{
"epoch": 1.21,
"grad_norm": 0.16796875,
"learning_rate": 8.017572612799135e-05,
"loss": 2.1308,
"step": 3585
},
{
"epoch": 1.22,
"grad_norm": 0.162109375,
"learning_rate": 7.988602951740717e-05,
"loss": 2.1215,
"step": 3590
},
{
"epoch": 1.22,
"grad_norm": 0.166015625,
"learning_rate": 7.959650872250688e-05,
"loss": 2.1063,
"step": 3595
},
{
"epoch": 1.22,
"grad_norm": 0.1650390625,
"learning_rate": 7.930716627398412e-05,
"loss": 2.1156,
"step": 3600
},
{
"epoch": 1.22,
"grad_norm": 0.1669921875,
"learning_rate": 7.901800470097355e-05,
"loss": 2.1169,
"step": 3605
},
{
"epoch": 1.22,
"grad_norm": 0.166015625,
"learning_rate": 7.872902653102884e-05,
"loss": 2.0943,
"step": 3610
},
{
"epoch": 1.22,
"grad_norm": 0.169921875,
"learning_rate": 7.84402342901005e-05,
"loss": 2.1382,
"step": 3615
},
{
"epoch": 1.23,
"grad_norm": 0.169921875,
"learning_rate": 7.815163050251395e-05,
"loss": 2.1235,
"step": 3620
},
{
"epoch": 1.23,
"grad_norm": 0.1630859375,
"learning_rate": 7.786321769094717e-05,
"loss": 2.1157,
"step": 3625
},
{
"epoch": 1.23,
"grad_norm": 0.16796875,
"learning_rate": 7.7574998376409e-05,
"loss": 2.107,
"step": 3630
},
{
"epoch": 1.23,
"grad_norm": 0.1689453125,
"learning_rate": 7.728697507821674e-05,
"loss": 2.1166,
"step": 3635
},
{
"epoch": 1.23,
"grad_norm": 0.1689453125,
"learning_rate": 7.699915031397452e-05,
"loss": 2.1276,
"step": 3640
},
{
"epoch": 1.23,
"grad_norm": 0.1689453125,
"learning_rate": 7.671152659955096e-05,
"loss": 2.12,
"step": 3645
},
{
"epoch": 1.24,
"grad_norm": 0.1630859375,
"learning_rate": 7.642410644905726e-05,
"loss": 2.1143,
"step": 3650
},
{
"epoch": 1.24,
"grad_norm": 0.162109375,
"learning_rate": 7.613689237482551e-05,
"loss": 2.1057,
"step": 3655
},
{
"epoch": 1.24,
"grad_norm": 0.1650390625,
"learning_rate": 7.584988688738622e-05,
"loss": 2.0954,
"step": 3660
},
{
"epoch": 1.24,
"grad_norm": 0.1650390625,
"learning_rate": 7.556309249544678e-05,
"loss": 2.1458,
"step": 3665
},
{
"epoch": 1.24,
"grad_norm": 0.16796875,
"learning_rate": 7.527651170586936e-05,
"loss": 2.106,
"step": 3670
},
{
"epoch": 1.24,
"grad_norm": 0.1640625,
"learning_rate": 7.499014702364913e-05,
"loss": 2.112,
"step": 3675
},
{
"epoch": 1.25,
"grad_norm": 0.166015625,
"learning_rate": 7.470400095189219e-05,
"loss": 2.1394,
"step": 3680
},
{
"epoch": 1.25,
"grad_norm": 0.1640625,
"learning_rate": 7.44180759917937e-05,
"loss": 2.0966,
"step": 3685
},
{
"epoch": 1.25,
"grad_norm": 0.169921875,
"learning_rate": 7.413237464261627e-05,
"loss": 2.1037,
"step": 3690
},
{
"epoch": 1.25,
"grad_norm": 0.1689453125,
"learning_rate": 7.38468994016678e-05,
"loss": 2.1184,
"step": 3695
},
{
"epoch": 1.25,
"grad_norm": 0.1669921875,
"learning_rate": 7.356165276427983e-05,
"loss": 2.1213,
"step": 3700
},
{
"epoch": 1.25,
"grad_norm": 0.1611328125,
"learning_rate": 7.327663722378561e-05,
"loss": 2.0926,
"step": 3705
},
{
"epoch": 1.26,
"grad_norm": 0.1640625,
"learning_rate": 7.299185527149853e-05,
"loss": 2.1019,
"step": 3710
},
{
"epoch": 1.26,
"grad_norm": 0.166015625,
"learning_rate": 7.270730939669006e-05,
"loss": 2.0903,
"step": 3715
},
{
"epoch": 1.26,
"grad_norm": 0.166015625,
"learning_rate": 7.242300208656814e-05,
"loss": 2.1252,
"step": 3720
},
{
"epoch": 1.26,
"grad_norm": 0.162109375,
"learning_rate": 7.213893582625548e-05,
"loss": 2.1357,
"step": 3725
},
{
"epoch": 1.26,
"grad_norm": 0.169921875,
"learning_rate": 7.185511309876775e-05,
"loss": 2.0909,
"step": 3730
},
{
"epoch": 1.27,
"grad_norm": 0.166015625,
"learning_rate": 7.157153638499188e-05,
"loss": 2.0689,
"step": 3735
},
{
"epoch": 1.27,
"grad_norm": 0.1640625,
"learning_rate": 7.128820816366442e-05,
"loss": 2.1063,
"step": 3740
},
{
"epoch": 1.27,
"grad_norm": 0.1689453125,
"learning_rate": 7.100513091134989e-05,
"loss": 2.1295,
"step": 3745
},
{
"epoch": 1.27,
"grad_norm": 0.173828125,
"learning_rate": 7.072230710241905e-05,
"loss": 2.1384,
"step": 3750
},
{
"epoch": 1.27,
"grad_norm": 0.1689453125,
"learning_rate": 7.043973920902729e-05,
"loss": 2.1216,
"step": 3755
},
{
"epoch": 1.27,
"grad_norm": 0.1640625,
"learning_rate": 7.015742970109317e-05,
"loss": 2.1304,
"step": 3760
},
{
"epoch": 1.28,
"grad_norm": 0.1630859375,
"learning_rate": 6.98753810462766e-05,
"loss": 2.1098,
"step": 3765
},
{
"epoch": 1.28,
"grad_norm": 0.16796875,
"learning_rate": 6.959359570995738e-05,
"loss": 2.127,
"step": 3770
},
{
"epoch": 1.28,
"grad_norm": 0.169921875,
"learning_rate": 6.931207615521366e-05,
"loss": 2.1041,
"step": 3775
},
{
"epoch": 1.28,
"grad_norm": 0.16796875,
"learning_rate": 6.903082484280053e-05,
"loss": 2.1141,
"step": 3780
},
{
"epoch": 1.28,
"grad_norm": 0.169921875,
"learning_rate": 6.874984423112819e-05,
"loss": 2.1364,
"step": 3785
},
{
"epoch": 1.28,
"grad_norm": 0.1708984375,
"learning_rate": 6.84691367762407e-05,
"loss": 2.1172,
"step": 3790
},
{
"epoch": 1.29,
"grad_norm": 0.16796875,
"learning_rate": 6.818870493179458e-05,
"loss": 2.0934,
"step": 3795
},
{
"epoch": 1.29,
"grad_norm": 0.1650390625,
"learning_rate": 6.790855114903714e-05,
"loss": 2.1207,
"step": 3800
},
{
"epoch": 1.29,
"grad_norm": 0.1640625,
"learning_rate": 6.762867787678512e-05,
"loss": 2.121,
"step": 3805
},
{
"epoch": 1.29,
"grad_norm": 0.1650390625,
"learning_rate": 6.73490875614035e-05,
"loss": 2.1111,
"step": 3810
},
{
"epoch": 1.29,
"grad_norm": 0.171875,
"learning_rate": 6.706978264678376e-05,
"loss": 2.1089,
"step": 3815
},
{
"epoch": 1.29,
"grad_norm": 0.1708984375,
"learning_rate": 6.679076557432278e-05,
"loss": 2.1221,
"step": 3820
},
{
"epoch": 1.3,
"grad_norm": 0.16796875,
"learning_rate": 6.651203878290139e-05,
"loss": 2.1238,
"step": 3825
},
{
"epoch": 1.3,
"grad_norm": 0.1650390625,
"learning_rate": 6.623360470886314e-05,
"loss": 2.1229,
"step": 3830
},
{
"epoch": 1.3,
"grad_norm": 0.166015625,
"learning_rate": 6.59554657859929e-05,
"loss": 2.0983,
"step": 3835
},
{
"epoch": 1.3,
"grad_norm": 0.16796875,
"learning_rate": 6.567762444549558e-05,
"loss": 2.1041,
"step": 3840
},
{
"epoch": 1.3,
"grad_norm": 0.1630859375,
"learning_rate": 6.540008311597507e-05,
"loss": 2.1202,
"step": 3845
},
{
"epoch": 1.3,
"grad_norm": 0.1669921875,
"learning_rate": 6.512284422341275e-05,
"loss": 2.0983,
"step": 3850
},
{
"epoch": 1.31,
"grad_norm": 0.166015625,
"learning_rate": 6.484591019114646e-05,
"loss": 2.1122,
"step": 3855
},
{
"epoch": 1.31,
"grad_norm": 0.1650390625,
"learning_rate": 6.456928343984919e-05,
"loss": 2.1393,
"step": 3860
},
{
"epoch": 1.31,
"grad_norm": 0.16796875,
"learning_rate": 6.429296638750814e-05,
"loss": 2.1091,
"step": 3865
},
{
"epoch": 1.31,
"grad_norm": 0.16796875,
"learning_rate": 6.401696144940332e-05,
"loss": 2.0942,
"step": 3870
},
{
"epoch": 1.31,
"grad_norm": 0.1669921875,
"learning_rate": 6.374127103808654e-05,
"loss": 2.1106,
"step": 3875
},
{
"epoch": 1.31,
"grad_norm": 0.1669921875,
"learning_rate": 6.34658975633605e-05,
"loss": 2.1063,
"step": 3880
},
{
"epoch": 1.32,
"grad_norm": 0.16796875,
"learning_rate": 6.319084343225738e-05,
"loss": 2.1309,
"step": 3885
},
{
"epoch": 1.32,
"grad_norm": 0.1689453125,
"learning_rate": 6.291611104901812e-05,
"loss": 2.1288,
"step": 3890
},
{
"epoch": 1.32,
"grad_norm": 0.1650390625,
"learning_rate": 6.264170281507111e-05,
"loss": 2.1197,
"step": 3895
},
{
"epoch": 1.32,
"grad_norm": 0.166015625,
"learning_rate": 6.236762112901158e-05,
"loss": 2.1203,
"step": 3900
},
{
"epoch": 1.32,
"grad_norm": 0.169921875,
"learning_rate": 6.209386838658024e-05,
"loss": 2.1263,
"step": 3905
},
{
"epoch": 1.32,
"grad_norm": 0.169921875,
"learning_rate": 6.182044698064256e-05,
"loss": 2.1084,
"step": 3910
},
{
"epoch": 1.33,
"grad_norm": 0.166015625,
"learning_rate": 6.154735930116786e-05,
"loss": 2.1207,
"step": 3915
},
{
"epoch": 1.33,
"grad_norm": 0.169921875,
"learning_rate": 6.12746077352083e-05,
"loss": 2.0995,
"step": 3920
},
{
"epoch": 1.33,
"grad_norm": 0.1650390625,
"learning_rate": 6.1002194666878106e-05,
"loss": 2.1247,
"step": 3925
},
{
"epoch": 1.33,
"grad_norm": 0.1669921875,
"learning_rate": 6.0730122477332675e-05,
"loss": 2.1024,
"step": 3930
},
{
"epoch": 1.33,
"grad_norm": 0.169921875,
"learning_rate": 6.045839354474786e-05,
"loss": 2.0901,
"step": 3935
},
{
"epoch": 1.33,
"grad_norm": 0.1630859375,
"learning_rate": 6.0187010244299046e-05,
"loss": 2.0994,
"step": 3940
},
{
"epoch": 1.34,
"grad_norm": 0.169921875,
"learning_rate": 5.9915974948140474e-05,
"loss": 2.1009,
"step": 3945
},
{
"epoch": 1.34,
"grad_norm": 0.1669921875,
"learning_rate": 5.964529002538455e-05,
"loss": 2.0828,
"step": 3950
},
{
"epoch": 1.34,
"grad_norm": 0.166015625,
"learning_rate": 5.937495784208096e-05,
"loss": 2.1322,
"step": 3955
},
{
"epoch": 1.34,
"grad_norm": 0.171875,
"learning_rate": 5.910498076119622e-05,
"loss": 2.1117,
"step": 3960
},
{
"epoch": 1.34,
"grad_norm": 0.1669921875,
"learning_rate": 5.883536114259277e-05,
"loss": 2.0814,
"step": 3965
},
{
"epoch": 1.34,
"grad_norm": 0.166015625,
"learning_rate": 5.8566101343008687e-05,
"loss": 2.103,
"step": 3970
},
{
"epoch": 1.35,
"grad_norm": 0.171875,
"learning_rate": 5.829720371603664e-05,
"loss": 2.1117,
"step": 3975
},
{
"epoch": 1.35,
"grad_norm": 0.1689453125,
"learning_rate": 5.802867061210375e-05,
"loss": 2.1094,
"step": 3980
},
{
"epoch": 1.35,
"grad_norm": 0.169921875,
"learning_rate": 5.776050437845075e-05,
"loss": 2.1009,
"step": 3985
},
{
"epoch": 1.35,
"grad_norm": 0.1708984375,
"learning_rate": 5.749270735911158e-05,
"loss": 2.1074,
"step": 3990
},
{
"epoch": 1.35,
"grad_norm": 0.16796875,
"learning_rate": 5.7225281894892935e-05,
"loss": 2.1046,
"step": 3995
},
{
"epoch": 1.35,
"grad_norm": 0.166015625,
"learning_rate": 5.695823032335366e-05,
"loss": 2.1119,
"step": 4000
},
{
"epoch": 1.36,
"grad_norm": 0.1689453125,
"learning_rate": 5.669155497878454e-05,
"loss": 2.1184,
"step": 4005
},
{
"epoch": 1.36,
"grad_norm": 0.1650390625,
"learning_rate": 5.642525819218769e-05,
"loss": 2.0827,
"step": 4010
},
{
"epoch": 1.36,
"grad_norm": 0.166015625,
"learning_rate": 5.6159342291256254e-05,
"loss": 2.0952,
"step": 4015
},
{
"epoch": 1.36,
"grad_norm": 0.16796875,
"learning_rate": 5.589380960035417e-05,
"loss": 2.1268,
"step": 4020
},
{
"epoch": 1.36,
"grad_norm": 0.1640625,
"learning_rate": 5.562866244049557e-05,
"loss": 2.1282,
"step": 4025
},
{
"epoch": 1.36,
"grad_norm": 0.16796875,
"learning_rate": 5.53639031293248e-05,
"loss": 2.1115,
"step": 4030
},
{
"epoch": 1.37,
"grad_norm": 0.1728515625,
"learning_rate": 5.509953398109594e-05,
"loss": 2.1172,
"step": 4035
},
{
"epoch": 1.37,
"grad_norm": 0.166015625,
"learning_rate": 5.483555730665282e-05,
"loss": 2.0858,
"step": 4040
},
{
"epoch": 1.37,
"grad_norm": 0.16796875,
"learning_rate": 5.457197541340853e-05,
"loss": 2.1138,
"step": 4045
},
{
"epoch": 1.37,
"grad_norm": 0.1669921875,
"learning_rate": 5.4308790605325364e-05,
"loss": 2.1407,
"step": 4050
},
{
"epoch": 1.37,
"grad_norm": 0.166015625,
"learning_rate": 5.404600518289487e-05,
"loss": 2.1155,
"step": 4055
},
{
"epoch": 1.38,
"grad_norm": 0.166015625,
"learning_rate": 5.3783621443117414e-05,
"loss": 2.1038,
"step": 4060
},
{
"epoch": 1.38,
"grad_norm": 0.1650390625,
"learning_rate": 5.352164167948233e-05,
"loss": 2.1022,
"step": 4065
},
{
"epoch": 1.38,
"grad_norm": 0.16796875,
"learning_rate": 5.326006818194782e-05,
"loss": 2.1037,
"step": 4070
},
{
"epoch": 1.38,
"grad_norm": 0.1669921875,
"learning_rate": 5.2998903236920895e-05,
"loss": 2.1187,
"step": 4075
},
{
"epoch": 1.38,
"grad_norm": 0.171875,
"learning_rate": 5.273814912723742e-05,
"loss": 2.1133,
"step": 4080
},
{
"epoch": 1.38,
"grad_norm": 0.169921875,
"learning_rate": 5.247780813214214e-05,
"loss": 2.1261,
"step": 4085
},
{
"epoch": 1.39,
"grad_norm": 0.171875,
"learning_rate": 5.221788252726889e-05,
"loss": 2.1114,
"step": 4090
},
{
"epoch": 1.39,
"grad_norm": 0.16796875,
"learning_rate": 5.195837458462045e-05,
"loss": 2.112,
"step": 4095
},
{
"epoch": 1.39,
"grad_norm": 0.16796875,
"learning_rate": 5.16992865725489e-05,
"loss": 2.125,
"step": 4100
},
{
"epoch": 1.39,
"grad_norm": 0.1669921875,
"learning_rate": 5.14406207557357e-05,
"loss": 2.0831,
"step": 4105
},
{
"epoch": 1.39,
"grad_norm": 0.1708984375,
"learning_rate": 5.11823793951719e-05,
"loss": 2.1304,
"step": 4110
},
{
"epoch": 1.39,
"grad_norm": 0.1748046875,
"learning_rate": 5.092456474813841e-05,
"loss": 2.1066,
"step": 4115
},
{
"epoch": 1.4,
"grad_norm": 0.1640625,
"learning_rate": 5.066717906818618e-05,
"loss": 2.1207,
"step": 4120
},
{
"epoch": 1.4,
"grad_norm": 0.1611328125,
"learning_rate": 5.041022460511673e-05,
"loss": 2.1132,
"step": 4125
},
{
"epoch": 1.4,
"grad_norm": 0.169921875,
"learning_rate": 5.015370360496219e-05,
"loss": 2.1384,
"step": 4130
},
{
"epoch": 1.4,
"grad_norm": 0.166015625,
"learning_rate": 4.989761830996581e-05,
"loss": 2.1042,
"step": 4135
},
{
"epoch": 1.4,
"grad_norm": 0.1640625,
"learning_rate": 4.9641970958562366e-05,
"loss": 2.1241,
"step": 4140
},
{
"epoch": 1.4,
"grad_norm": 0.171875,
"learning_rate": 4.938676378535866e-05,
"loss": 2.1237,
"step": 4145
},
{
"epoch": 1.41,
"grad_norm": 0.1787109375,
"learning_rate": 4.913199902111385e-05,
"loss": 2.1187,
"step": 4150
},
{
"epoch": 1.41,
"grad_norm": 0.166015625,
"learning_rate": 4.8877678892719866e-05,
"loss": 2.0729,
"step": 4155
},
{
"epoch": 1.41,
"grad_norm": 0.171875,
"learning_rate": 4.862380562318236e-05,
"loss": 2.1086,
"step": 4160
},
{
"epoch": 1.41,
"grad_norm": 0.16796875,
"learning_rate": 4.837038143160082e-05,
"loss": 2.118,
"step": 4165
},
{
"epoch": 1.41,
"grad_norm": 0.166015625,
"learning_rate": 4.811740853314939e-05,
"loss": 2.1179,
"step": 4170
},
{
"epoch": 1.41,
"grad_norm": 0.169921875,
"learning_rate": 4.786488913905745e-05,
"loss": 2.1102,
"step": 4175
},
{
"epoch": 1.42,
"grad_norm": 0.169921875,
"learning_rate": 4.7612825456590435e-05,
"loss": 2.0915,
"step": 4180
},
{
"epoch": 1.42,
"grad_norm": 0.16796875,
"learning_rate": 4.736121968903027e-05,
"loss": 2.1319,
"step": 4185
},
{
"epoch": 1.42,
"grad_norm": 0.1669921875,
"learning_rate": 4.7110074035656316e-05,
"loss": 2.0898,
"step": 4190
},
{
"epoch": 1.42,
"grad_norm": 0.16796875,
"learning_rate": 4.685939069172609e-05,
"loss": 2.1212,
"step": 4195
},
{
"epoch": 1.42,
"grad_norm": 0.1669921875,
"learning_rate": 4.6609171848456066e-05,
"loss": 2.0967,
"step": 4200
},
{
"epoch": 1.42,
"grad_norm": 0.1669921875,
"learning_rate": 4.6359419693002534e-05,
"loss": 2.095,
"step": 4205
},
{
"epoch": 1.43,
"grad_norm": 0.1728515625,
"learning_rate": 4.611013640844245e-05,
"loss": 2.1181,
"step": 4210
},
{
"epoch": 1.43,
"grad_norm": 0.1689453125,
"learning_rate": 4.5861324173754484e-05,
"loss": 2.1135,
"step": 4215
},
{
"epoch": 1.43,
"grad_norm": 0.1689453125,
"learning_rate": 4.561298516379974e-05,
"loss": 2.1027,
"step": 4220
},
{
"epoch": 1.43,
"grad_norm": 0.166015625,
"learning_rate": 4.5365121549302914e-05,
"loss": 2.094,
"step": 4225
},
{
"epoch": 1.43,
"grad_norm": 0.169921875,
"learning_rate": 4.5117735496833415e-05,
"loss": 2.1279,
"step": 4230
},
{
"epoch": 1.43,
"grad_norm": 0.166015625,
"learning_rate": 4.487082916878606e-05,
"loss": 2.1079,
"step": 4235
},
{
"epoch": 1.44,
"grad_norm": 0.1650390625,
"learning_rate": 4.4624404723362576e-05,
"loss": 2.1143,
"step": 4240
},
{
"epoch": 1.44,
"grad_norm": 0.1650390625,
"learning_rate": 4.437846431455249e-05,
"loss": 2.0944,
"step": 4245
},
{
"epoch": 1.44,
"grad_norm": 0.171875,
"learning_rate": 4.4133010092114494e-05,
"loss": 2.1205,
"step": 4250
},
{
"epoch": 1.44,
"grad_norm": 0.166015625,
"learning_rate": 4.3888044201557376e-05,
"loss": 2.1112,
"step": 4255
},
{
"epoch": 1.44,
"grad_norm": 0.1708984375,
"learning_rate": 4.36435687841215e-05,
"loss": 2.0874,
"step": 4260
},
{
"epoch": 1.44,
"grad_norm": 0.1728515625,
"learning_rate": 4.3399585976760105e-05,
"loss": 2.1054,
"step": 4265
},
{
"epoch": 1.45,
"grad_norm": 0.169921875,
"learning_rate": 4.3156097912120385e-05,
"loss": 2.1011,
"step": 4270
},
{
"epoch": 1.45,
"grad_norm": 0.173828125,
"learning_rate": 4.29131067185251e-05,
"loss": 2.1303,
"step": 4275
},
{
"epoch": 1.45,
"grad_norm": 0.1689453125,
"learning_rate": 4.2670614519953834e-05,
"loss": 2.0863,
"step": 4280
},
{
"epoch": 1.45,
"grad_norm": 0.169921875,
"learning_rate": 4.242862343602447e-05,
"loss": 2.0792,
"step": 4285
},
{
"epoch": 1.45,
"grad_norm": 0.1689453125,
"learning_rate": 4.21871355819747e-05,
"loss": 2.1121,
"step": 4290
},
{
"epoch": 1.45,
"grad_norm": 0.1708984375,
"learning_rate": 4.19461530686434e-05,
"loss": 2.1082,
"step": 4295
},
{
"epoch": 1.46,
"grad_norm": 0.177734375,
"learning_rate": 4.170567800245244e-05,
"loss": 2.1194,
"step": 4300
},
{
"epoch": 1.46,
"grad_norm": 0.169921875,
"learning_rate": 4.1465712485387966e-05,
"loss": 2.1103,
"step": 4305
},
{
"epoch": 1.46,
"grad_norm": 0.169921875,
"learning_rate": 4.1226258614982214e-05,
"loss": 2.1125,
"step": 4310
},
{
"epoch": 1.46,
"grad_norm": 0.1708984375,
"learning_rate": 4.0987318484295135e-05,
"loss": 2.1029,
"step": 4315
},
{
"epoch": 1.46,
"grad_norm": 0.1748046875,
"learning_rate": 4.074889418189608e-05,
"loss": 2.1172,
"step": 4320
},
{
"epoch": 1.46,
"grad_norm": 0.1640625,
"learning_rate": 4.051098779184559e-05,
"loss": 2.0825,
"step": 4325
},
{
"epoch": 1.47,
"grad_norm": 0.1640625,
"learning_rate": 4.0273601393677064e-05,
"loss": 2.1349,
"step": 4330
},
{
"epoch": 1.47,
"grad_norm": 0.171875,
"learning_rate": 4.0036737062378823e-05,
"loss": 2.1057,
"step": 4335
},
{
"epoch": 1.47,
"grad_norm": 0.16796875,
"learning_rate": 3.980039686837568e-05,
"loss": 2.1363,
"step": 4340
},
{
"epoch": 1.47,
"grad_norm": 0.171875,
"learning_rate": 3.956458287751097e-05,
"loss": 2.1177,
"step": 4345
},
{
"epoch": 1.47,
"grad_norm": 0.16796875,
"learning_rate": 3.932929715102863e-05,
"loss": 2.0994,
"step": 4350
},
{
"epoch": 1.48,
"grad_norm": 0.1728515625,
"learning_rate": 3.9094541745554946e-05,
"loss": 2.0962,
"step": 4355
},
{
"epoch": 1.48,
"grad_norm": 0.171875,
"learning_rate": 3.8860318713080725e-05,
"loss": 2.1128,
"step": 4360
},
{
"epoch": 1.48,
"grad_norm": 0.16796875,
"learning_rate": 3.8626630100943196e-05,
"loss": 2.1146,
"step": 4365
},
{
"epoch": 1.48,
"grad_norm": 0.169921875,
"learning_rate": 3.8393477951808444e-05,
"loss": 2.1022,
"step": 4370
},
{
"epoch": 1.48,
"grad_norm": 0.16796875,
"learning_rate": 3.816086430365321e-05,
"loss": 2.1184,
"step": 4375
},
{
"epoch": 1.48,
"grad_norm": 0.1669921875,
"learning_rate": 3.7928791189747195e-05,
"loss": 2.1077,
"step": 4380
},
{
"epoch": 1.49,
"grad_norm": 0.16796875,
"learning_rate": 3.769726063863541e-05,
"loss": 2.1133,
"step": 4385
},
{
"epoch": 1.49,
"grad_norm": 0.1650390625,
"learning_rate": 3.746627467412026e-05,
"loss": 2.1012,
"step": 4390
},
{
"epoch": 1.49,
"grad_norm": 0.16796875,
"learning_rate": 3.723583531524394e-05,
"loss": 2.1151,
"step": 4395
},
{
"epoch": 1.49,
"grad_norm": 0.16796875,
"learning_rate": 3.700594457627079e-05,
"loss": 2.1374,
"step": 4400
},
{
"epoch": 1.49,
"grad_norm": 0.16796875,
"learning_rate": 3.6776604466669686e-05,
"loss": 2.1082,
"step": 4405
},
{
"epoch": 1.49,
"grad_norm": 0.169921875,
"learning_rate": 3.654781699109645e-05,
"loss": 2.1171,
"step": 4410
},
{
"epoch": 1.5,
"grad_norm": 0.16796875,
"learning_rate": 3.631958414937633e-05,
"loss": 2.1057,
"step": 4415
},
{
"epoch": 1.5,
"grad_norm": 0.166015625,
"learning_rate": 3.609190793648661e-05,
"loss": 2.1366,
"step": 4420
},
{
"epoch": 1.5,
"grad_norm": 0.171875,
"learning_rate": 3.586479034253902e-05,
"loss": 2.116,
"step": 4425
},
{
"epoch": 1.5,
"grad_norm": 0.1689453125,
"learning_rate": 3.563823335276244e-05,
"loss": 2.1004,
"step": 4430
},
{
"epoch": 1.5,
"grad_norm": 0.1689453125,
"learning_rate": 3.541223894748553e-05,
"loss": 2.0959,
"step": 4435
},
{
"epoch": 1.5,
"grad_norm": 0.1669921875,
"learning_rate": 3.51868091021194e-05,
"loss": 2.1085,
"step": 4440
},
{
"epoch": 1.51,
"grad_norm": 0.1689453125,
"learning_rate": 3.496194578714036e-05,
"loss": 2.1207,
"step": 4445
},
{
"epoch": 1.51,
"grad_norm": 0.1689453125,
"learning_rate": 3.473765096807269e-05,
"loss": 2.1179,
"step": 4450
},
{
"epoch": 1.51,
"grad_norm": 0.166015625,
"learning_rate": 3.45139266054715e-05,
"loss": 2.1068,
"step": 4455
},
{
"epoch": 1.51,
"grad_norm": 0.16796875,
"learning_rate": 3.429077465490551e-05,
"loss": 2.1213,
"step": 4460
},
{
"epoch": 1.51,
"grad_norm": 0.1728515625,
"learning_rate": 3.406819706694003e-05,
"loss": 2.0988,
"step": 4465
},
{
"epoch": 1.51,
"grad_norm": 0.1708984375,
"learning_rate": 3.3846195787119814e-05,
"loss": 2.0996,
"step": 4470
},
{
"epoch": 1.52,
"grad_norm": 0.1689453125,
"learning_rate": 3.362477275595225e-05,
"loss": 2.0933,
"step": 4475
},
{
"epoch": 1.52,
"grad_norm": 0.171875,
"learning_rate": 3.340392990889018e-05,
"loss": 2.0977,
"step": 4480
},
{
"epoch": 1.52,
"grad_norm": 0.173828125,
"learning_rate": 3.3183669176315045e-05,
"loss": 2.1335,
"step": 4485
},
{
"epoch": 1.52,
"grad_norm": 0.166015625,
"learning_rate": 3.296399248352012e-05,
"loss": 2.0964,
"step": 4490
},
{
"epoch": 1.52,
"grad_norm": 0.16796875,
"learning_rate": 3.2744901750693556e-05,
"loss": 2.0952,
"step": 4495
},
{
"epoch": 1.52,
"grad_norm": 0.169921875,
"learning_rate": 3.2526398892901654e-05,
"loss": 2.0881,
"step": 4500
},
{
"epoch": 1.53,
"grad_norm": 0.169921875,
"learning_rate": 3.2308485820072075e-05,
"loss": 2.1046,
"step": 4505
},
{
"epoch": 1.53,
"grad_norm": 0.1669921875,
"learning_rate": 3.2091164436977294e-05,
"loss": 2.1092,
"step": 4510
},
{
"epoch": 1.53,
"grad_norm": 0.1650390625,
"learning_rate": 3.187443664321773e-05,
"loss": 2.0628,
"step": 4515
},
{
"epoch": 1.53,
"grad_norm": 0.16796875,
"learning_rate": 3.165830433320531e-05,
"loss": 2.1031,
"step": 4520
},
{
"epoch": 1.53,
"grad_norm": 0.162109375,
"learning_rate": 3.144276939614683e-05,
"loss": 2.1297,
"step": 4525
},
{
"epoch": 1.53,
"grad_norm": 0.16796875,
"learning_rate": 3.122783371602747e-05,
"loss": 2.1324,
"step": 4530
},
{
"epoch": 1.54,
"grad_norm": 0.166015625,
"learning_rate": 3.101349917159433e-05,
"loss": 2.1365,
"step": 4535
},
{
"epoch": 1.54,
"grad_norm": 0.16796875,
"learning_rate": 3.079976763633996e-05,
"loss": 2.1137,
"step": 4540
},
{
"epoch": 1.54,
"grad_norm": 0.16796875,
"learning_rate": 3.058664097848612e-05,
"loss": 2.1106,
"step": 4545
},
{
"epoch": 1.54,
"grad_norm": 0.171875,
"learning_rate": 3.0374121060967255e-05,
"loss": 2.1024,
"step": 4550
},
{
"epoch": 1.54,
"grad_norm": 0.1669921875,
"learning_rate": 3.0162209741414304e-05,
"loss": 2.1043,
"step": 4555
},
{
"epoch": 1.54,
"grad_norm": 0.1708984375,
"learning_rate": 2.9950908872138584e-05,
"loss": 2.0755,
"step": 4560
},
{
"epoch": 1.55,
"grad_norm": 0.1669921875,
"learning_rate": 2.9740220300115386e-05,
"loss": 2.1192,
"step": 4565
},
{
"epoch": 1.55,
"grad_norm": 0.1708984375,
"learning_rate": 2.9530145866967895e-05,
"loss": 2.1415,
"step": 4570
},
{
"epoch": 1.55,
"grad_norm": 0.1650390625,
"learning_rate": 2.9320687408951162e-05,
"loss": 2.0845,
"step": 4575
},
{
"epoch": 1.55,
"grad_norm": 0.166015625,
"learning_rate": 2.9111846756936113e-05,
"loss": 2.1182,
"step": 4580
},
{
"epoch": 1.55,
"grad_norm": 0.1669921875,
"learning_rate": 2.8903625736393304e-05,
"loss": 2.0882,
"step": 4585
},
{
"epoch": 1.55,
"grad_norm": 0.1689453125,
"learning_rate": 2.8696026167377155e-05,
"loss": 2.1499,
"step": 4590
},
{
"epoch": 1.56,
"grad_norm": 0.17578125,
"learning_rate": 2.8489049864510054e-05,
"loss": 2.1045,
"step": 4595
},
{
"epoch": 1.56,
"grad_norm": 0.1708984375,
"learning_rate": 2.8282698636966375e-05,
"loss": 2.1391,
"step": 4600
},
{
"epoch": 1.56,
"grad_norm": 0.169921875,
"learning_rate": 2.8076974288456726e-05,
"loss": 2.0716,
"step": 4605
},
{
"epoch": 1.56,
"grad_norm": 0.171875,
"learning_rate": 2.78718786172122e-05,
"loss": 2.1046,
"step": 4610
},
{
"epoch": 1.56,
"grad_norm": 0.169921875,
"learning_rate": 2.7667413415968635e-05,
"loss": 2.0917,
"step": 4615
},
{
"epoch": 1.56,
"grad_norm": 0.1689453125,
"learning_rate": 2.7463580471950943e-05,
"loss": 2.1355,
"step": 4620
},
{
"epoch": 1.57,
"grad_norm": 0.1650390625,
"learning_rate": 2.7260381566857473e-05,
"loss": 2.0997,
"step": 4625
},
{
"epoch": 1.57,
"grad_norm": 0.166015625,
"learning_rate": 2.7057818476844533e-05,
"loss": 2.0962,
"step": 4630
},
{
"epoch": 1.57,
"grad_norm": 0.16796875,
"learning_rate": 2.68558929725107e-05,
"loss": 2.095,
"step": 4635
},
{
"epoch": 1.57,
"grad_norm": 0.166015625,
"learning_rate": 2.6654606818881465e-05,
"loss": 2.1209,
"step": 4640
},
{
"epoch": 1.57,
"grad_norm": 0.1640625,
"learning_rate": 2.645396177539379e-05,
"loss": 2.1147,
"step": 4645
},
{
"epoch": 1.57,
"grad_norm": 0.1650390625,
"learning_rate": 2.6253959595880673e-05,
"loss": 2.1074,
"step": 4650
},
{
"epoch": 1.58,
"grad_norm": 0.16796875,
"learning_rate": 2.6054602028555885e-05,
"loss": 2.1129,
"step": 4655
},
{
"epoch": 1.58,
"grad_norm": 0.16796875,
"learning_rate": 2.585589081599862e-05,
"loss": 2.1113,
"step": 4660
},
{
"epoch": 1.58,
"grad_norm": 0.1689453125,
"learning_rate": 2.5657827695138372e-05,
"loss": 2.1018,
"step": 4665
},
{
"epoch": 1.58,
"grad_norm": 0.177734375,
"learning_rate": 2.546041439723963e-05,
"loss": 2.1268,
"step": 4670
},
{
"epoch": 1.58,
"grad_norm": 0.16796875,
"learning_rate": 2.5263652647886803e-05,
"loss": 2.115,
"step": 4675
},
{
"epoch": 1.59,
"grad_norm": 0.1669921875,
"learning_rate": 2.5067544166969114e-05,
"loss": 2.1177,
"step": 4680
},
{
"epoch": 1.59,
"grad_norm": 0.169921875,
"learning_rate": 2.487209066866565e-05,
"loss": 2.0815,
"step": 4685
},
{
"epoch": 1.59,
"grad_norm": 0.16796875,
"learning_rate": 2.467729386143025e-05,
"loss": 2.106,
"step": 4690
},
{
"epoch": 1.59,
"grad_norm": 0.1689453125,
"learning_rate": 2.4483155447976657e-05,
"loss": 2.1241,
"step": 4695
},
{
"epoch": 1.59,
"grad_norm": 0.1708984375,
"learning_rate": 2.42896771252636e-05,
"loss": 2.0945,
"step": 4700
},
{
"epoch": 1.59,
"grad_norm": 0.1669921875,
"learning_rate": 2.4096860584479974e-05,
"loss": 2.1349,
"step": 4705
},
{
"epoch": 1.6,
"grad_norm": 0.16796875,
"learning_rate": 2.390470751103008e-05,
"loss": 2.0902,
"step": 4710
},
{
"epoch": 1.6,
"grad_norm": 0.1669921875,
"learning_rate": 2.37132195845188e-05,
"loss": 2.1333,
"step": 4715
},
{
"epoch": 1.6,
"grad_norm": 0.16796875,
"learning_rate": 2.3522398478737108e-05,
"loss": 2.0972,
"step": 4720
},
{
"epoch": 1.6,
"grad_norm": 0.1689453125,
"learning_rate": 2.3332245861647206e-05,
"loss": 2.087,
"step": 4725
},
{
"epoch": 1.6,
"grad_norm": 0.1728515625,
"learning_rate": 2.3142763395368095e-05,
"loss": 2.0908,
"step": 4730
},
{
"epoch": 1.6,
"grad_norm": 0.1669921875,
"learning_rate": 2.295395273616099e-05,
"loss": 2.1023,
"step": 4735
},
{
"epoch": 1.61,
"grad_norm": 0.171875,
"learning_rate": 2.2765815534414868e-05,
"loss": 2.0931,
"step": 4740
},
{
"epoch": 1.61,
"grad_norm": 0.1630859375,
"learning_rate": 2.257835343463205e-05,
"loss": 2.0842,
"step": 4745
},
{
"epoch": 1.61,
"grad_norm": 0.1728515625,
"learning_rate": 2.239156807541375e-05,
"loss": 2.1044,
"step": 4750
},
{
"epoch": 1.61,
"grad_norm": 0.173828125,
"learning_rate": 2.22054610894459e-05,
"loss": 2.1367,
"step": 4755
},
{
"epoch": 1.61,
"grad_norm": 0.1806640625,
"learning_rate": 2.202003410348473e-05,
"loss": 2.0934,
"step": 4760
},
{
"epoch": 1.61,
"grad_norm": 0.1669921875,
"learning_rate": 2.1835288738342596e-05,
"loss": 2.1331,
"step": 4765
},
{
"epoch": 1.62,
"grad_norm": 0.169921875,
"learning_rate": 2.1651226608873877e-05,
"loss": 2.0986,
"step": 4770
},
{
"epoch": 1.62,
"grad_norm": 0.16796875,
"learning_rate": 2.1467849323960797e-05,
"loss": 2.1361,
"step": 4775
},
{
"epoch": 1.62,
"grad_norm": 0.1748046875,
"learning_rate": 2.128515848649929e-05,
"loss": 2.1242,
"step": 4780
},
{
"epoch": 1.62,
"grad_norm": 0.16796875,
"learning_rate": 2.1103155693385136e-05,
"loss": 2.137,
"step": 4785
},
{
"epoch": 1.62,
"grad_norm": 0.1748046875,
"learning_rate": 2.092184253549998e-05,
"loss": 2.1203,
"step": 4790
},
{
"epoch": 1.62,
"grad_norm": 0.169921875,
"learning_rate": 2.074122059769733e-05,
"loss": 2.08,
"step": 4795
},
{
"epoch": 1.63,
"grad_norm": 0.16796875,
"learning_rate": 2.0561291458788733e-05,
"loss": 2.1012,
"step": 4800
},
{
"epoch": 1.63,
"grad_norm": 0.171875,
"learning_rate": 2.0382056691530084e-05,
"loss": 2.138,
"step": 4805
},
{
"epoch": 1.63,
"grad_norm": 0.16796875,
"learning_rate": 2.02035178626077e-05,
"loss": 2.1155,
"step": 4810
},
{
"epoch": 1.63,
"grad_norm": 0.166015625,
"learning_rate": 2.002567653262479e-05,
"loss": 2.0964,
"step": 4815
},
{
"epoch": 1.63,
"grad_norm": 0.1640625,
"learning_rate": 1.984853425608769e-05,
"loss": 2.1355,
"step": 4820
},
{
"epoch": 1.63,
"grad_norm": 0.169921875,
"learning_rate": 1.9672092581392375e-05,
"loss": 2.1272,
"step": 4825
},
{
"epoch": 1.64,
"grad_norm": 0.171875,
"learning_rate": 1.9496353050810843e-05,
"loss": 2.1183,
"step": 4830
},
{
"epoch": 1.64,
"grad_norm": 0.1630859375,
"learning_rate": 1.9321317200477653e-05,
"loss": 2.1447,
"step": 4835
},
{
"epoch": 1.64,
"grad_norm": 0.16796875,
"learning_rate": 1.91469865603766e-05,
"loss": 2.0945,
"step": 4840
},
{
"epoch": 1.64,
"grad_norm": 0.16796875,
"learning_rate": 1.8973362654327175e-05,
"loss": 2.117,
"step": 4845
},
{
"epoch": 1.64,
"grad_norm": 0.166015625,
"learning_rate": 1.8800446999971346e-05,
"loss": 2.1077,
"step": 4850
},
{
"epoch": 1.64,
"grad_norm": 0.1767578125,
"learning_rate": 1.8628241108760268e-05,
"loss": 2.0966,
"step": 4855
},
{
"epoch": 1.65,
"grad_norm": 0.1708984375,
"learning_rate": 1.845674648594108e-05,
"loss": 2.1312,
"step": 4860
},
{
"epoch": 1.65,
"grad_norm": 0.1669921875,
"learning_rate": 1.828596463054375e-05,
"loss": 2.0882,
"step": 4865
},
{
"epoch": 1.65,
"grad_norm": 0.1689453125,
"learning_rate": 1.8115897035367934e-05,
"loss": 2.0857,
"step": 4870
},
{
"epoch": 1.65,
"grad_norm": 0.1689453125,
"learning_rate": 1.7946545186970022e-05,
"loss": 2.1279,
"step": 4875
},
{
"epoch": 1.65,
"grad_norm": 0.1708984375,
"learning_rate": 1.7777910565650024e-05,
"loss": 2.0998,
"step": 4880
},
{
"epoch": 1.65,
"grad_norm": 0.169921875,
"learning_rate": 1.760999464543869e-05,
"loss": 2.0973,
"step": 4885
},
{
"epoch": 1.66,
"grad_norm": 0.1728515625,
"learning_rate": 1.7442798894084655e-05,
"loss": 2.1296,
"step": 4890
},
{
"epoch": 1.66,
"grad_norm": 0.1689453125,
"learning_rate": 1.7276324773041565e-05,
"loss": 2.1325,
"step": 4895
},
{
"epoch": 1.66,
"grad_norm": 0.171875,
"learning_rate": 1.7110573737455295e-05,
"loss": 2.1089,
"step": 4900
},
{
"epoch": 1.66,
"grad_norm": 0.1669921875,
"learning_rate": 1.694554723615126e-05,
"loss": 2.1011,
"step": 4905
},
{
"epoch": 1.66,
"grad_norm": 0.16796875,
"learning_rate": 1.6781246711621744e-05,
"loss": 2.1267,
"step": 4910
},
{
"epoch": 1.66,
"grad_norm": 0.1650390625,
"learning_rate": 1.6617673600013296e-05,
"loss": 2.1263,
"step": 4915
},
{
"epoch": 1.67,
"grad_norm": 0.1689453125,
"learning_rate": 1.645482933111412e-05,
"loss": 2.0923,
"step": 4920
},
{
"epoch": 1.67,
"grad_norm": 0.16796875,
"learning_rate": 1.6292715328341712e-05,
"loss": 2.0923,
"step": 4925
},
{
"epoch": 1.67,
"grad_norm": 0.1728515625,
"learning_rate": 1.6131333008730277e-05,
"loss": 2.1355,
"step": 4930
},
{
"epoch": 1.67,
"grad_norm": 0.1728515625,
"learning_rate": 1.5970683782918374e-05,
"loss": 2.108,
"step": 4935
},
{
"epoch": 1.67,
"grad_norm": 0.16796875,
"learning_rate": 1.5810769055136644e-05,
"loss": 2.1115,
"step": 4940
},
{
"epoch": 1.67,
"grad_norm": 0.171875,
"learning_rate": 1.56515902231955e-05,
"loss": 2.1023,
"step": 4945
},
{
"epoch": 1.68,
"grad_norm": 0.169921875,
"learning_rate": 1.5493148678472903e-05,
"loss": 2.1269,
"step": 4950
},
{
"epoch": 1.68,
"grad_norm": 0.171875,
"learning_rate": 1.533544580590217e-05,
"loss": 2.1085,
"step": 4955
},
{
"epoch": 1.68,
"grad_norm": 0.1728515625,
"learning_rate": 1.5178482983959985e-05,
"loss": 2.0965,
"step": 4960
},
{
"epoch": 1.68,
"grad_norm": 0.169921875,
"learning_rate": 1.5022261584654207e-05,
"loss": 2.129,
"step": 4965
},
{
"epoch": 1.68,
"grad_norm": 0.1669921875,
"learning_rate": 1.4866782973511962e-05,
"loss": 2.0724,
"step": 4970
},
{
"epoch": 1.69,
"grad_norm": 0.16796875,
"learning_rate": 1.4712048509567634e-05,
"loss": 2.1429,
"step": 4975
},
{
"epoch": 1.69,
"grad_norm": 0.16796875,
"learning_rate": 1.4558059545351143e-05,
"loss": 2.1328,
"step": 4980
},
{
"epoch": 1.69,
"grad_norm": 0.16796875,
"learning_rate": 1.4404817426875938e-05,
"loss": 2.0847,
"step": 4985
},
{
"epoch": 1.69,
"grad_norm": 0.1689453125,
"learning_rate": 1.4252323493627251e-05,
"loss": 2.1036,
"step": 4990
},
{
"epoch": 1.69,
"grad_norm": 0.169921875,
"learning_rate": 1.4100579078550613e-05,
"loss": 2.09,
"step": 4995
},
{
"epoch": 1.69,
"grad_norm": 0.1748046875,
"learning_rate": 1.3949585508039886e-05,
"loss": 2.1229,
"step": 5000
},
{
"epoch": 1.7,
"grad_norm": 0.1669921875,
"learning_rate": 1.3799344101925904e-05,
"loss": 2.1345,
"step": 5005
},
{
"epoch": 1.7,
"grad_norm": 0.177734375,
"learning_rate": 1.3649856173464781e-05,
"loss": 2.1066,
"step": 5010
},
{
"epoch": 1.7,
"grad_norm": 0.1669921875,
"learning_rate": 1.3501123029326601e-05,
"loss": 2.1274,
"step": 5015
},
{
"epoch": 1.7,
"grad_norm": 0.169921875,
"learning_rate": 1.3353145969583813e-05,
"loss": 2.1117,
"step": 5020
},
{
"epoch": 1.7,
"grad_norm": 0.1669921875,
"learning_rate": 1.3205926287699988e-05,
"loss": 2.1139,
"step": 5025
},
{
"epoch": 1.7,
"grad_norm": 0.1728515625,
"learning_rate": 1.3059465270518468e-05,
"loss": 2.113,
"step": 5030
},
{
"epoch": 1.71,
"grad_norm": 0.16796875,
"learning_rate": 1.2913764198251132e-05,
"loss": 2.1568,
"step": 5035
},
{
"epoch": 1.71,
"grad_norm": 0.169921875,
"learning_rate": 1.276882434446719e-05,
"loss": 2.1105,
"step": 5040
},
{
"epoch": 1.71,
"grad_norm": 0.169921875,
"learning_rate": 1.2624646976082066e-05,
"loss": 2.1108,
"step": 5045
},
{
"epoch": 1.71,
"grad_norm": 0.16796875,
"learning_rate": 1.2481233353346344e-05,
"loss": 2.1027,
"step": 5050
},
{
"epoch": 1.71,
"grad_norm": 0.1689453125,
"learning_rate": 1.2338584729834701e-05,
"loss": 2.0852,
"step": 5055
},
{
"epoch": 1.71,
"grad_norm": 0.16796875,
"learning_rate": 1.2196702352434953e-05,
"loss": 2.1189,
"step": 5060
},
{
"epoch": 1.72,
"grad_norm": 0.169921875,
"learning_rate": 1.205558746133727e-05,
"loss": 2.1078,
"step": 5065
},
{
"epoch": 1.72,
"grad_norm": 0.16796875,
"learning_rate": 1.1915241290023115e-05,
"loss": 2.097,
"step": 5070
},
{
"epoch": 1.72,
"grad_norm": 0.1728515625,
"learning_rate": 1.1775665065254704e-05,
"loss": 2.1288,
"step": 5075
},
{
"epoch": 1.72,
"grad_norm": 0.169921875,
"learning_rate": 1.1636860007064076e-05,
"loss": 2.082,
"step": 5080
},
{
"epoch": 1.72,
"grad_norm": 0.171875,
"learning_rate": 1.1498827328742623e-05,
"loss": 2.1079,
"step": 5085
},
{
"epoch": 1.72,
"grad_norm": 0.1689453125,
"learning_rate": 1.1361568236830323e-05,
"loss": 2.1292,
"step": 5090
},
{
"epoch": 1.73,
"grad_norm": 0.166015625,
"learning_rate": 1.122508393110524e-05,
"loss": 2.1246,
"step": 5095
},
{
"epoch": 1.73,
"grad_norm": 0.16796875,
"learning_rate": 1.1089375604573116e-05,
"loss": 2.0824,
"step": 5100
},
{
"epoch": 1.73,
"grad_norm": 0.1708984375,
"learning_rate": 1.0954444443456824e-05,
"loss": 2.0788,
"step": 5105
},
{
"epoch": 1.73,
"grad_norm": 0.169921875,
"learning_rate": 1.0820291627186107e-05,
"loss": 2.106,
"step": 5110
},
{
"epoch": 1.73,
"grad_norm": 0.1640625,
"learning_rate": 1.0686918328387118e-05,
"loss": 2.1059,
"step": 5115
},
{
"epoch": 1.73,
"grad_norm": 0.16796875,
"learning_rate": 1.0554325712872381e-05,
"loss": 2.1064,
"step": 5120
},
{
"epoch": 1.74,
"grad_norm": 0.173828125,
"learning_rate": 1.0422514939630424e-05,
"loss": 2.1087,
"step": 5125
},
{
"epoch": 1.74,
"grad_norm": 0.1708984375,
"learning_rate": 1.0291487160815726e-05,
"loss": 2.1372,
"step": 5130
},
{
"epoch": 1.74,
"grad_norm": 0.1669921875,
"learning_rate": 1.0161243521738661e-05,
"loss": 2.1257,
"step": 5135
},
{
"epoch": 1.74,
"grad_norm": 0.1650390625,
"learning_rate": 1.003178516085541e-05,
"loss": 2.1214,
"step": 5140
},
{
"epoch": 1.74,
"grad_norm": 0.169921875,
"learning_rate": 9.903113209758096e-06,
"loss": 2.1239,
"step": 5145
},
{
"epoch": 1.74,
"grad_norm": 0.16796875,
"learning_rate": 9.775228793164826e-06,
"loss": 2.0882,
"step": 5150
},
{
"epoch": 1.75,
"grad_norm": 0.1689453125,
"learning_rate": 9.6481330289099e-06,
"loss": 2.0986,
"step": 5155
},
{
"epoch": 1.75,
"grad_norm": 0.1669921875,
"learning_rate": 9.521827027934038e-06,
"loss": 2.1189,
"step": 5160
},
{
"epoch": 1.75,
"grad_norm": 0.1689453125,
"learning_rate": 9.396311894274645e-06,
"loss": 2.1268,
"step": 5165
},
{
"epoch": 1.75,
"grad_norm": 0.1669921875,
"learning_rate": 9.271588725056201e-06,
"loss": 2.1027,
"step": 5170
},
{
"epoch": 1.75,
"grad_norm": 0.169921875,
"learning_rate": 9.147658610480625e-06,
"loss": 2.097,
"step": 5175
},
{
"epoch": 1.75,
"grad_norm": 0.1689453125,
"learning_rate": 9.024522633817756e-06,
"loss": 2.124,
"step": 5180
},
{
"epoch": 1.76,
"grad_norm": 0.169921875,
"learning_rate": 8.90218187139591e-06,
"loss": 2.1412,
"step": 5185
},
{
"epoch": 1.76,
"grad_norm": 0.1689453125,
"learning_rate": 8.780637392592495e-06,
"loss": 2.1046,
"step": 5190
},
{
"epoch": 1.76,
"grad_norm": 0.16796875,
"learning_rate": 8.659890259824532e-06,
"loss": 2.1173,
"step": 5195
},
{
"epoch": 1.76,
"grad_norm": 0.1630859375,
"learning_rate": 8.53994152853952e-06,
"loss": 2.1193,
"step": 5200
},
{
"epoch": 1.76,
"grad_norm": 0.1689453125,
"learning_rate": 8.420792247206177e-06,
"loss": 2.1264,
"step": 5205
},
{
"epoch": 1.76,
"grad_norm": 0.16796875,
"learning_rate": 8.302443457305209e-06,
"loss": 2.0896,
"step": 5210
},
{
"epoch": 1.77,
"grad_norm": 0.1708984375,
"learning_rate": 8.184896193320246e-06,
"loss": 2.1078,
"step": 5215
},
{
"epoch": 1.77,
"grad_norm": 0.1689453125,
"learning_rate": 8.068151482728802e-06,
"loss": 2.0992,
"step": 5220
},
{
"epoch": 1.77,
"grad_norm": 0.16796875,
"learning_rate": 7.952210345993339e-06,
"loss": 2.1263,
"step": 5225
},
{
"epoch": 1.77,
"grad_norm": 0.16796875,
"learning_rate": 7.837073796552241e-06,
"loss": 2.0959,
"step": 5230
},
{
"epoch": 1.77,
"grad_norm": 0.169921875,
"learning_rate": 7.72274284081106e-06,
"loss": 2.1071,
"step": 5235
},
{
"epoch": 1.77,
"grad_norm": 0.169921875,
"learning_rate": 7.609218478133628e-06,
"loss": 2.1182,
"step": 5240
},
{
"epoch": 1.78,
"grad_norm": 0.1669921875,
"learning_rate": 7.4965017008334245e-06,
"loss": 2.0935,
"step": 5245
},
{
"epoch": 1.78,
"grad_norm": 0.169921875,
"learning_rate": 7.3845934941648046e-06,
"loss": 2.1126,
"step": 5250
},
{
"epoch": 1.78,
"grad_norm": 0.16796875,
"learning_rate": 7.2734948363144206e-06,
"loss": 2.0975,
"step": 5255
},
{
"epoch": 1.78,
"grad_norm": 0.16796875,
"learning_rate": 7.163206698392744e-06,
"loss": 2.1164,
"step": 5260
},
{
"epoch": 1.78,
"grad_norm": 0.1650390625,
"learning_rate": 7.0537300444254435e-06,
"loss": 2.1231,
"step": 5265
},
{
"epoch": 1.78,
"grad_norm": 0.169921875,
"learning_rate": 6.945065831345077e-06,
"loss": 2.1181,
"step": 5270
},
{
"epoch": 1.79,
"grad_norm": 0.1728515625,
"learning_rate": 6.837215008982633e-06,
"loss": 2.1167,
"step": 5275
},
{
"epoch": 1.79,
"grad_norm": 0.169921875,
"learning_rate": 6.7301785200593046e-06,
"loss": 2.1473,
"step": 5280
},
{
"epoch": 1.79,
"grad_norm": 0.1748046875,
"learning_rate": 6.623957300178207e-06,
"loss": 2.0955,
"step": 5285
},
{
"epoch": 1.79,
"grad_norm": 0.16796875,
"learning_rate": 6.518552277816215e-06,
"loss": 2.1281,
"step": 5290
},
{
"epoch": 1.79,
"grad_norm": 0.169921875,
"learning_rate": 6.413964374315851e-06,
"loss": 2.1291,
"step": 5295
},
{
"epoch": 1.8,
"grad_norm": 0.1669921875,
"learning_rate": 6.31019450387721e-06,
"loss": 2.1031,
"step": 5300
},
{
"epoch": 1.8,
"grad_norm": 0.17578125,
"learning_rate": 6.207243573549959e-06,
"loss": 2.1046,
"step": 5305
},
{
"epoch": 1.8,
"grad_norm": 0.1689453125,
"learning_rate": 6.1051124832254944e-06,
"loss": 2.0975,
"step": 5310
},
{
"epoch": 1.8,
"grad_norm": 0.1689453125,
"learning_rate": 6.003802125628966e-06,
"loss": 2.1329,
"step": 5315
},
{
"epoch": 1.8,
"grad_norm": 0.171875,
"learning_rate": 5.903313386311527e-06,
"loss": 2.0939,
"step": 5320
},
{
"epoch": 1.8,
"grad_norm": 0.1689453125,
"learning_rate": 5.803647143642554e-06,
"loss": 2.1194,
"step": 5325
},
{
"epoch": 1.81,
"grad_norm": 0.169921875,
"learning_rate": 5.704804268802077e-06,
"loss": 2.1236,
"step": 5330
},
{
"epoch": 1.81,
"grad_norm": 0.1748046875,
"learning_rate": 5.606785625773048e-06,
"loss": 2.1484,
"step": 5335
},
{
"epoch": 1.81,
"grad_norm": 0.1748046875,
"learning_rate": 5.5095920713338134e-06,
"loss": 2.14,
"step": 5340
},
{
"epoch": 1.81,
"grad_norm": 0.17578125,
"learning_rate": 5.413224455050692e-06,
"loss": 2.128,
"step": 5345
},
{
"epoch": 1.81,
"grad_norm": 0.1689453125,
"learning_rate": 5.3176836192704414e-06,
"loss": 2.1357,
"step": 5350
},
{
"epoch": 1.81,
"grad_norm": 0.16796875,
"learning_rate": 5.222970399112981e-06,
"loss": 2.1191,
"step": 5355
},
{
"epoch": 1.82,
"grad_norm": 0.16796875,
"learning_rate": 5.12908562246408e-06,
"loss": 2.1321,
"step": 5360
},
{
"epoch": 1.82,
"grad_norm": 0.171875,
"learning_rate": 5.036030109968082e-06,
"loss": 2.1284,
"step": 5365
},
{
"epoch": 1.82,
"grad_norm": 0.1650390625,
"learning_rate": 4.9438046750207465e-06,
"loss": 2.117,
"step": 5370
},
{
"epoch": 1.82,
"grad_norm": 0.16796875,
"learning_rate": 4.8524101237621635e-06,
"loss": 2.1062,
"step": 5375
},
{
"epoch": 1.82,
"grad_norm": 0.1689453125,
"learning_rate": 4.7618472550696954e-06,
"loss": 2.1147,
"step": 5380
},
{
"epoch": 1.82,
"grad_norm": 0.1728515625,
"learning_rate": 4.67211686055099e-06,
"loss": 2.1405,
"step": 5385
},
{
"epoch": 1.83,
"grad_norm": 0.1728515625,
"learning_rate": 4.583219724537046e-06,
"loss": 2.1145,
"step": 5390
},
{
"epoch": 1.83,
"grad_norm": 0.16796875,
"learning_rate": 4.495156624075347e-06,
"loss": 2.1064,
"step": 5395
},
{
"epoch": 1.83,
"grad_norm": 0.169921875,
"learning_rate": 4.407928328923194e-06,
"loss": 2.1347,
"step": 5400
},
{
"epoch": 1.83,
"grad_norm": 0.1728515625,
"learning_rate": 4.321535601540738e-06,
"loss": 2.1005,
"step": 5405
},
{
"epoch": 1.83,
"grad_norm": 0.16796875,
"learning_rate": 4.2359791970845496e-06,
"loss": 2.0963,
"step": 5410
},
{
"epoch": 1.83,
"grad_norm": 0.1708984375,
"learning_rate": 4.1512598634009025e-06,
"loss": 2.0732,
"step": 5415
},
{
"epoch": 1.84,
"grad_norm": 0.1689453125,
"learning_rate": 4.067378341019257e-06,
"loss": 2.1268,
"step": 5420
},
{
"epoch": 1.84,
"grad_norm": 0.173828125,
"learning_rate": 3.984335363145752e-06,
"loss": 2.1028,
"step": 5425
},
{
"epoch": 1.84,
"grad_norm": 0.1708984375,
"learning_rate": 3.902131655656893e-06,
"loss": 2.0816,
"step": 5430
},
{
"epoch": 1.84,
"grad_norm": 0.16796875,
"learning_rate": 3.820767937093095e-06,
"loss": 2.1153,
"step": 5435
},
{
"epoch": 1.84,
"grad_norm": 0.166015625,
"learning_rate": 3.740244918652469e-06,
"loss": 2.1147,
"step": 5440
},
{
"epoch": 1.84,
"grad_norm": 0.169921875,
"learning_rate": 3.6605633041846053e-06,
"loss": 2.0833,
"step": 5445
},
{
"epoch": 1.85,
"grad_norm": 0.1689453125,
"learning_rate": 3.581723790184377e-06,
"loss": 2.1154,
"step": 5450
},
{
"epoch": 1.85,
"grad_norm": 0.1650390625,
"learning_rate": 3.503727065785878e-06,
"loss": 2.1346,
"step": 5455
},
{
"epoch": 1.85,
"grad_norm": 0.173828125,
"learning_rate": 3.4265738127564286e-06,
"loss": 2.1045,
"step": 5460
},
{
"epoch": 1.85,
"grad_norm": 0.171875,
"learning_rate": 3.350264705490569e-06,
"loss": 2.1144,
"step": 5465
},
{
"epoch": 1.85,
"grad_norm": 0.1708984375,
"learning_rate": 3.2748004110041863e-06,
"loss": 2.1074,
"step": 5470
},
{
"epoch": 1.85,
"grad_norm": 0.1669921875,
"learning_rate": 3.2001815889286856e-06,
"loss": 2.0837,
"step": 5475
},
{
"epoch": 1.86,
"grad_norm": 0.1689453125,
"learning_rate": 3.126408891505217e-06,
"loss": 2.0873,
"step": 5480
},
{
"epoch": 1.86,
"grad_norm": 0.16796875,
"learning_rate": 3.0534829635789684e-06,
"loss": 2.1231,
"step": 5485
},
{
"epoch": 1.86,
"grad_norm": 0.1669921875,
"learning_rate": 2.9814044425935606e-06,
"loss": 2.1073,
"step": 5490
},
{
"epoch": 1.86,
"grad_norm": 0.1689453125,
"learning_rate": 2.910173958585416e-06,
"loss": 2.0818,
"step": 5495
},
{
"epoch": 1.86,
"grad_norm": 0.169921875,
"learning_rate": 2.8397921341783317e-06,
"loss": 2.0771,
"step": 5500
},
{
"epoch": 1.86,
"grad_norm": 0.1669921875,
"learning_rate": 2.770259584577972e-06,
"loss": 2.1079,
"step": 5505
},
{
"epoch": 1.87,
"grad_norm": 0.1650390625,
"learning_rate": 2.7015769175665063e-06,
"loss": 2.0804,
"step": 5510
},
{
"epoch": 1.87,
"grad_norm": 0.169921875,
"learning_rate": 2.633744733497312e-06,
"loss": 2.1227,
"step": 5515
},
{
"epoch": 1.87,
"grad_norm": 0.1767578125,
"learning_rate": 2.5667636252897143e-06,
"loss": 2.1088,
"step": 5520
},
{
"epoch": 1.87,
"grad_norm": 0.1630859375,
"learning_rate": 2.5006341784238107e-06,
"loss": 2.0958,
"step": 5525
},
{
"epoch": 1.87,
"grad_norm": 0.171875,
"learning_rate": 2.435356970935354e-06,
"loss": 2.1027,
"step": 5530
},
{
"epoch": 1.87,
"grad_norm": 0.171875,
"learning_rate": 2.370932573410667e-06,
"loss": 2.1082,
"step": 5535
},
{
"epoch": 1.88,
"grad_norm": 0.171875,
"learning_rate": 2.3073615489817235e-06,
"loss": 2.1316,
"step": 5540
},
{
"epoch": 1.88,
"grad_norm": 0.16796875,
"learning_rate": 2.244644453321165e-06,
"loss": 2.0948,
"step": 5545
},
{
"epoch": 1.88,
"grad_norm": 0.1728515625,
"learning_rate": 2.1827818346374482e-06,
"loss": 2.1408,
"step": 5550
},
{
"epoch": 1.88,
"grad_norm": 0.169921875,
"learning_rate": 2.121774233670104e-06,
"loss": 2.1094,
"step": 5555
},
{
"epoch": 1.88,
"grad_norm": 0.16796875,
"learning_rate": 2.0616221836849638e-06,
"loss": 2.1169,
"step": 5560
},
{
"epoch": 1.88,
"grad_norm": 0.1708984375,
"learning_rate": 2.0023262104694852e-06,
"loss": 2.1126,
"step": 5565
},
{
"epoch": 1.89,
"grad_norm": 0.1708984375,
"learning_rate": 1.9438868323282124e-06,
"loss": 2.118,
"step": 5570
},
{
"epoch": 1.89,
"grad_norm": 0.1689453125,
"learning_rate": 1.8863045600782003e-06,
"loss": 2.1207,
"step": 5575
},
{
"epoch": 1.89,
"grad_norm": 0.1748046875,
"learning_rate": 1.8295798970445754e-06,
"loss": 2.1231,
"step": 5580
},
{
"epoch": 1.89,
"grad_norm": 0.1728515625,
"learning_rate": 1.7737133390561046e-06,
"loss": 2.112,
"step": 5585
},
{
"epoch": 1.89,
"grad_norm": 0.16796875,
"learning_rate": 1.7187053744409098e-06,
"loss": 2.098,
"step": 5590
},
{
"epoch": 1.9,
"grad_norm": 0.169921875,
"learning_rate": 1.6645564840221396e-06,
"loss": 2.1042,
"step": 5595
},
{
"epoch": 1.9,
"grad_norm": 0.1669921875,
"learning_rate": 1.6112671411138036e-06,
"loss": 2.086,
"step": 5600
},
{
"epoch": 1.9,
"grad_norm": 0.1669921875,
"learning_rate": 1.5588378115166669e-06,
"loss": 2.0714,
"step": 5605
},
{
"epoch": 1.9,
"grad_norm": 0.16796875,
"learning_rate": 1.5072689535141072e-06,
"loss": 2.1117,
"step": 5610
},
{
"epoch": 1.9,
"grad_norm": 0.16796875,
"learning_rate": 1.4565610178681299e-06,
"loss": 2.1318,
"step": 5615
},
{
"epoch": 1.9,
"grad_norm": 0.169921875,
"learning_rate": 1.4067144478154604e-06,
"loss": 2.116,
"step": 5620
},
{
"epoch": 1.91,
"grad_norm": 0.1689453125,
"learning_rate": 1.3577296790636684e-06,
"loss": 2.1302,
"step": 5625
},
{
"epoch": 1.91,
"grad_norm": 0.1708984375,
"learning_rate": 1.3096071397873056e-06,
"loss": 2.0978,
"step": 5630
},
{
"epoch": 1.91,
"grad_norm": 0.166015625,
"learning_rate": 1.2623472506242184e-06,
"loss": 2.1322,
"step": 5635
},
{
"epoch": 1.91,
"grad_norm": 0.171875,
"learning_rate": 1.2159504246718522e-06,
"loss": 2.1102,
"step": 5640
},
{
"epoch": 1.91,
"grad_norm": 0.16796875,
"learning_rate": 1.1704170674836313e-06,
"loss": 2.1059,
"step": 5645
},
{
"epoch": 1.91,
"grad_norm": 0.1630859375,
"learning_rate": 1.125747577065428e-06,
"loss": 2.1137,
"step": 5650
},
{
"epoch": 1.92,
"grad_norm": 0.169921875,
"learning_rate": 1.0819423438720665e-06,
"loss": 2.1135,
"step": 5655
},
{
"epoch": 1.92,
"grad_norm": 0.1748046875,
"learning_rate": 1.0390017508039473e-06,
"loss": 2.1429,
"step": 5660
},
{
"epoch": 1.92,
"grad_norm": 0.17578125,
"learning_rate": 9.969261732036605e-07,
"loss": 2.1312,
"step": 5665
},
{
"epoch": 1.92,
"grad_norm": 0.1650390625,
"learning_rate": 9.557159788526892e-07,
"loss": 2.1151,
"step": 5670
},
{
"epoch": 1.92,
"grad_norm": 0.1708984375,
"learning_rate": 9.153715279682784e-07,
"loss": 2.1337,
"step": 5675
},
{
"epoch": 1.92,
"grad_norm": 0.1689453125,
"learning_rate": 8.75893173200204e-07,
"loss": 2.1069,
"step": 5680
},
{
"epoch": 1.93,
"grad_norm": 0.169921875,
"learning_rate": 8.372812596277091e-07,
"loss": 2.1095,
"step": 5685
},
{
"epoch": 1.93,
"grad_norm": 0.1689453125,
"learning_rate": 7.99536124756517e-07,
"loss": 2.0928,
"step": 5690
},
{
"epoch": 1.93,
"grad_norm": 0.1708984375,
"learning_rate": 7.62658098515856e-07,
"loss": 2.0955,
"step": 5695
},
{
"epoch": 1.93,
"grad_norm": 0.1689453125,
"learning_rate": 7.266475032555619e-07,
"loss": 2.1201,
"step": 5700
},
{
"epoch": 1.93,
"grad_norm": 0.1708984375,
"learning_rate": 6.915046537433023e-07,
"loss": 2.1026,
"step": 5705
},
{
"epoch": 1.93,
"grad_norm": 0.1669921875,
"learning_rate": 6.572298571618118e-07,
"loss": 2.1075,
"step": 5710
},
{
"epoch": 1.94,
"grad_norm": 0.1689453125,
"learning_rate": 6.238234131061616e-07,
"loss": 2.0959,
"step": 5715
},
{
"epoch": 1.94,
"grad_norm": 0.1689453125,
"learning_rate": 5.912856135812051e-07,
"loss": 2.1019,
"step": 5720
},
{
"epoch": 1.94,
"grad_norm": 0.16796875,
"learning_rate": 5.596167429989807e-07,
"loss": 2.1053,
"step": 5725
},
{
"epoch": 1.94,
"grad_norm": 0.169921875,
"learning_rate": 5.288170781762469e-07,
"loss": 2.1113,
"step": 5730
},
{
"epoch": 1.94,
"grad_norm": 0.1689453125,
"learning_rate": 4.98886888332073e-07,
"loss": 2.109,
"step": 5735
},
{
"epoch": 1.94,
"grad_norm": 0.16796875,
"learning_rate": 4.698264350854409e-07,
"loss": 2.1136,
"step": 5740
},
{
"epoch": 1.95,
"grad_norm": 0.1767578125,
"learning_rate": 4.416359724530139e-07,
"loss": 2.1142,
"step": 5745
},
{
"epoch": 1.95,
"grad_norm": 0.1708984375,
"learning_rate": 4.143157468468717e-07,
"loss": 2.1152,
"step": 5750
},
{
"epoch": 1.95,
"grad_norm": 0.1689453125,
"learning_rate": 3.878659970724008e-07,
"loss": 2.1069,
"step": 5755
},
{
"epoch": 1.95,
"grad_norm": 0.16796875,
"learning_rate": 3.622869543261298e-07,
"loss": 2.0821,
"step": 5760
},
{
"epoch": 1.95,
"grad_norm": 0.17578125,
"learning_rate": 3.3757884219383085e-07,
"loss": 2.1078,
"step": 5765
},
{
"epoch": 1.95,
"grad_norm": 0.1728515625,
"learning_rate": 3.1374187664844346e-07,
"loss": 2.1151,
"step": 5770
},
{
"epoch": 1.96,
"grad_norm": 0.16796875,
"learning_rate": 2.907762660482538e-07,
"loss": 2.1021,
"step": 5775
},
{
"epoch": 1.96,
"grad_norm": 0.1708984375,
"learning_rate": 2.6868221113505175e-07,
"loss": 2.1222,
"step": 5780
},
{
"epoch": 1.96,
"grad_norm": 0.166015625,
"learning_rate": 2.474599050323989e-07,
"loss": 2.123,
"step": 5785
},
{
"epoch": 1.96,
"grad_norm": 0.169921875,
"learning_rate": 2.271095332438966e-07,
"loss": 2.0891,
"step": 5790
},
{
"epoch": 1.96,
"grad_norm": 0.166015625,
"learning_rate": 2.076312736516206e-07,
"loss": 2.1282,
"step": 5795
},
{
"epoch": 1.96,
"grad_norm": 0.1689453125,
"learning_rate": 1.890252965145112e-07,
"loss": 2.0931,
"step": 5800
},
{
"epoch": 1.97,
"grad_norm": 0.166015625,
"learning_rate": 1.7129176446692984e-07,
"loss": 2.0959,
"step": 5805
},
{
"epoch": 1.97,
"grad_norm": 0.1689453125,
"learning_rate": 1.5443083251720503e-07,
"loss": 2.1329,
"step": 5810
},
{
"epoch": 1.97,
"grad_norm": 0.1689453125,
"learning_rate": 1.384426480462997e-07,
"loss": 2.0971,
"step": 5815
},
{
"epoch": 1.97,
"grad_norm": 0.16796875,
"learning_rate": 1.2332735080651248e-07,
"loss": 2.0809,
"step": 5820
},
{
"epoch": 1.97,
"grad_norm": 0.171875,
"learning_rate": 1.0908507292026748e-07,
"loss": 2.1332,
"step": 5825
},
{
"epoch": 1.97,
"grad_norm": 0.1650390625,
"learning_rate": 9.571593887891528e-08,
"loss": 2.1077,
"step": 5830
},
{
"epoch": 1.98,
"grad_norm": 0.171875,
"learning_rate": 8.322006554171146e-08,
"loss": 2.1421,
"step": 5835
},
{
"epoch": 1.98,
"grad_norm": 0.1669921875,
"learning_rate": 7.159756213476199e-08,
"loss": 2.0922,
"step": 5840
},
{
"epoch": 1.98,
"grad_norm": 0.1640625,
"learning_rate": 6.084853025005721e-08,
"loss": 2.1117,
"step": 5845
},
{
"epoch": 1.98,
"grad_norm": 0.169921875,
"learning_rate": 5.0973063844605986e-08,
"loss": 2.0981,
"step": 5850
},
{
"epoch": 1.98,
"grad_norm": 0.169921875,
"learning_rate": 4.1971249239591834e-08,
"loss": 2.094,
"step": 5855
},
{
"epoch": 1.98,
"grad_norm": 0.1640625,
"learning_rate": 3.384316511964025e-08,
"loss": 2.1075,
"step": 5860
},
{
"epoch": 1.99,
"grad_norm": 0.1689453125,
"learning_rate": 2.658888253211922e-08,
"loss": 2.1071,
"step": 5865
},
{
"epoch": 1.99,
"grad_norm": 0.171875,
"learning_rate": 2.0208464886517508e-08,
"loss": 2.105,
"step": 5870
},
{
"epoch": 1.99,
"grad_norm": 0.16796875,
"learning_rate": 1.4701967953911766e-08,
"loss": 2.1025,
"step": 5875
},
{
"epoch": 1.99,
"grad_norm": 0.1689453125,
"learning_rate": 1.0069439866422503e-08,
"loss": 2.1108,
"step": 5880
},
{
"epoch": 1.99,
"grad_norm": 0.171875,
"learning_rate": 6.3109211168699275e-09,
"loss": 2.1134,
"step": 5885
},
{
"epoch": 1.99,
"grad_norm": 0.1630859375,
"learning_rate": 3.4264445583631622e-09,
"loss": 2.114,
"step": 5890
},
{
"epoch": 2.0,
"grad_norm": 0.169921875,
"learning_rate": 1.4160354040448908e-09,
"loss": 2.0933,
"step": 5895
},
{
"epoch": 2.0,
"grad_norm": 0.169921875,
"learning_rate": 2.7971122683601023e-10,
"loss": 2.125,
"step": 5900
},
{
"epoch": 2.0,
"eval_loss": 2.1289539337158203,
"eval_runtime": 154.4105,
"eval_samples_per_second": 8.451,
"eval_steps_per_second": 1.062,
"step": 5904
},
{
"epoch": 2.0,
"step": 5904,
"total_flos": 6.077264558911652e+17,
"train_loss": 2.139555886670503,
"train_runtime": 20822.1805,
"train_samples_per_second": 2.269,
"train_steps_per_second": 0.284
}
],
"logging_steps": 5,
"max_steps": 5904,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"total_flos": 6.077264558911652e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}