{ "best_metric": null, "best_model_checkpoint": null, "epoch": 29.99962510778151, "eval_steps": 500, "global_step": 1200330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "learning_rate": 7.497500833055648e-05, "loss": 11.5654, "step": 3000 }, { "epoch": 0.15, "learning_rate": 0.00014995001666111296, "loss": 1.9313, "step": 6000 }, { "epoch": 0.22, "learning_rate": 0.0002249250249916694, "loss": 1.3886, "step": 9000 }, { "epoch": 0.3, "learning_rate": 0.0002999000333222259, "loss": 1.3023, "step": 12000 }, { "epoch": 0.37, "learning_rate": 0.0002992436418962473, "loss": 1.2746, "step": 15000 }, { "epoch": 0.45, "learning_rate": 0.0002984862739685911, "loss": 1.2224, "step": 18000 }, { "epoch": 0.52, "learning_rate": 0.00029772890604093486, "loss": 1.1809, "step": 21000 }, { "epoch": 0.6, "learning_rate": 0.00029697153811327864, "loss": 1.1405, "step": 24000 }, { "epoch": 0.67, "learning_rate": 0.0002962141701856224, "loss": 1.1178, "step": 27000 }, { "epoch": 0.75, "learning_rate": 0.0002954568022579662, "loss": 1.0984, "step": 30000 }, { "epoch": 0.82, "learning_rate": 0.00029469943433031, "loss": 1.0659, "step": 33000 }, { "epoch": 0.9, "learning_rate": 0.0002939420664026538, "loss": 1.0466, "step": 36000 }, { "epoch": 0.97, "learning_rate": 0.0002931846984749976, "loss": 1.0377, "step": 39000 }, { "epoch": 1.05, "learning_rate": 0.00029242733054734134, "loss": 1.005, "step": 42000 }, { "epoch": 1.12, "learning_rate": 0.0002916699626196851, "loss": 0.9926, "step": 45000 }, { "epoch": 1.2, "learning_rate": 0.00029091259469202896, "loss": 0.9863, "step": 48000 }, { "epoch": 1.27, "learning_rate": 0.0002901552267643727, "loss": 0.9723, "step": 51000 }, { "epoch": 1.35, "learning_rate": 0.00028939785883671653, "loss": 0.9571, "step": 54000 }, { "epoch": 1.42, "learning_rate": 0.0002886404909090603, "loss": 0.933, "step": 57000 }, { "epoch": 1.5, "learning_rate": 0.00028788312298140404, "loss": 0.9296, "step": 60000 }, { "epoch": 1.57, "learning_rate": 0.0002871257550537479, "loss": 0.9255, "step": 63000 }, { "epoch": 1.65, "learning_rate": 0.00028636838712609166, "loss": 0.9173, "step": 66000 }, { "epoch": 1.72, "learning_rate": 0.0002856110191984354, "loss": 0.9063, "step": 69000 }, { "epoch": 1.8, "learning_rate": 0.00028485365127077923, "loss": 0.9004, "step": 72000 }, { "epoch": 1.87, "learning_rate": 0.000284096283343123, "loss": 0.8997, "step": 75000 }, { "epoch": 1.95, "learning_rate": 0.00028333891541546674, "loss": 0.8943, "step": 78000 }, { "epoch": 2.02, "learning_rate": 0.0002825815474878106, "loss": 0.8797, "step": 81000 }, { "epoch": 2.1, "learning_rate": 0.00028182417956015436, "loss": 0.8501, "step": 84000 }, { "epoch": 2.17, "learning_rate": 0.0002810668116324981, "loss": 0.8479, "step": 87000 }, { "epoch": 2.25, "learning_rate": 0.00028030944370484193, "loss": 0.8451, "step": 90000 }, { "epoch": 2.32, "learning_rate": 0.0002795520757771857, "loss": 0.8315, "step": 93000 }, { "epoch": 2.4, "learning_rate": 0.0002787947078495295, "loss": 0.8353, "step": 96000 }, { "epoch": 2.47, "learning_rate": 0.0002780373399218733, "loss": 0.8271, "step": 99000 }, { "epoch": 2.55, "learning_rate": 0.00027727997199421706, "loss": 0.8129, "step": 102000 }, { "epoch": 2.62, "learning_rate": 0.00027652260406656084, "loss": 0.8125, "step": 105000 }, { "epoch": 2.7, "learning_rate": 0.0002757652361389046, "loss": 0.8119, "step": 108000 }, { "epoch": 2.77, "learning_rate": 0.0002750078682112484, "loss": 0.8, "step": 111000 }, { "epoch": 2.85, "learning_rate": 0.0002742505002835922, "loss": 0.7979, "step": 114000 }, { "epoch": 2.92, "learning_rate": 0.000273493132355936, "loss": 0.7914, "step": 117000 }, { "epoch": 3.0, "learning_rate": 0.00027273576442827976, "loss": 0.7916, "step": 120000 }, { "epoch": 3.07, "learning_rate": 0.00027197839650062354, "loss": 0.7542, "step": 123000 }, { "epoch": 3.15, "learning_rate": 0.0002712210285729673, "loss": 0.7643, "step": 126000 }, { "epoch": 3.22, "learning_rate": 0.0002704636606453111, "loss": 0.7576, "step": 129000 }, { "epoch": 3.3, "learning_rate": 0.0002697062927176549, "loss": 0.7551, "step": 132000 }, { "epoch": 3.37, "learning_rate": 0.0002689489247899987, "loss": 0.7537, "step": 135000 }, { "epoch": 3.45, "learning_rate": 0.00026819155686234246, "loss": 0.7536, "step": 138000 }, { "epoch": 3.52, "learning_rate": 0.00026743418893468624, "loss": 0.7466, "step": 141000 }, { "epoch": 3.6, "learning_rate": 0.00026667682100703, "loss": 0.7488, "step": 144000 }, { "epoch": 3.67, "learning_rate": 0.0002659194530793738, "loss": 0.7445, "step": 147000 }, { "epoch": 3.75, "learning_rate": 0.0002651620851517176, "loss": 0.7396, "step": 150000 }, { "epoch": 3.82, "learning_rate": 0.0002644047172240614, "loss": 0.7387, "step": 153000 }, { "epoch": 3.9, "learning_rate": 0.00026364734929640516, "loss": 0.7324, "step": 156000 }, { "epoch": 3.97, "learning_rate": 0.00026288998136874894, "loss": 0.7297, "step": 159000 }, { "epoch": 4.05, "learning_rate": 0.0002621326134410928, "loss": 0.7069, "step": 162000 }, { "epoch": 4.12, "learning_rate": 0.0002613752455134365, "loss": 0.7001, "step": 165000 }, { "epoch": 4.2, "learning_rate": 0.0002606178775857803, "loss": 0.6945, "step": 168000 }, { "epoch": 4.27, "learning_rate": 0.00025986050965812413, "loss": 0.7017, "step": 171000 }, { "epoch": 4.35, "learning_rate": 0.00025910314173046786, "loss": 0.7018, "step": 174000 }, { "epoch": 4.42, "learning_rate": 0.00025834577380281164, "loss": 0.6977, "step": 177000 }, { "epoch": 4.5, "learning_rate": 0.0002575884058751555, "loss": 0.6923, "step": 180000 }, { "epoch": 4.57, "learning_rate": 0.0002568310379474992, "loss": 0.6972, "step": 183000 }, { "epoch": 4.65, "learning_rate": 0.000256073670019843, "loss": 0.685, "step": 186000 }, { "epoch": 4.72, "learning_rate": 0.00025531630209218683, "loss": 0.6857, "step": 189000 }, { "epoch": 4.8, "learning_rate": 0.00025455893416453056, "loss": 0.6846, "step": 192000 }, { "epoch": 4.87, "learning_rate": 0.00025380156623687434, "loss": 0.6848, "step": 195000 }, { "epoch": 4.95, "learning_rate": 0.0002530441983092182, "loss": 0.6783, "step": 198000 }, { "epoch": 5.02, "learning_rate": 0.00025228683038156196, "loss": 0.6676, "step": 201000 }, { "epoch": 5.1, "learning_rate": 0.0002515294624539057, "loss": 0.6514, "step": 204000 }, { "epoch": 5.17, "learning_rate": 0.00025077209452624953, "loss": 0.6469, "step": 207000 }, { "epoch": 5.25, "learning_rate": 0.0002500147265985933, "loss": 0.6483, "step": 210000 }, { "epoch": 5.32, "learning_rate": 0.00024925735867093704, "loss": 0.6525, "step": 213000 }, { "epoch": 5.4, "learning_rate": 0.0002484999907432809, "loss": 0.6487, "step": 216000 }, { "epoch": 5.47, "learning_rate": 0.00024774262281562466, "loss": 0.6495, "step": 219000 }, { "epoch": 5.55, "learning_rate": 0.00024698525488796845, "loss": 0.645, "step": 222000 }, { "epoch": 5.62, "learning_rate": 0.00024622788696031223, "loss": 0.6517, "step": 225000 }, { "epoch": 5.7, "learning_rate": 0.000245470519032656, "loss": 0.6459, "step": 228000 }, { "epoch": 5.77, "learning_rate": 0.0002447131511049998, "loss": 0.6451, "step": 231000 }, { "epoch": 5.85, "learning_rate": 0.00024395578317734358, "loss": 0.6412, "step": 234000 }, { "epoch": 5.92, "learning_rate": 0.00024319841524968736, "loss": 0.6457, "step": 237000 }, { "epoch": 6.0, "learning_rate": 0.00024244104732203112, "loss": 0.64, "step": 240000 }, { "epoch": 6.07, "learning_rate": 0.00024168367939437493, "loss": 0.6137, "step": 243000 }, { "epoch": 6.15, "learning_rate": 0.0002409263114667187, "loss": 0.6131, "step": 246000 }, { "epoch": 6.22, "learning_rate": 0.0002401689435390625, "loss": 0.6153, "step": 249000 }, { "epoch": 6.3, "learning_rate": 0.00023941157561140628, "loss": 0.6193, "step": 252000 }, { "epoch": 6.37, "learning_rate": 0.00023865420768375006, "loss": 0.6161, "step": 255000 }, { "epoch": 6.45, "learning_rate": 0.00023789683975609387, "loss": 0.614, "step": 258000 }, { "epoch": 6.52, "learning_rate": 0.00023713947182843763, "loss": 0.6142, "step": 261000 }, { "epoch": 6.6, "learning_rate": 0.0002363821039007814, "loss": 0.6181, "step": 264000 }, { "epoch": 6.67, "learning_rate": 0.00023562473597312522, "loss": 0.6098, "step": 267000 }, { "epoch": 6.75, "learning_rate": 0.00023486736804546898, "loss": 0.6165, "step": 270000 }, { "epoch": 6.82, "learning_rate": 0.00023411000011781276, "loss": 0.612, "step": 273000 }, { "epoch": 6.9, "learning_rate": 0.00023335263219015657, "loss": 0.6109, "step": 276000 }, { "epoch": 6.97, "learning_rate": 0.00023259526426250033, "loss": 0.6141, "step": 279000 }, { "epoch": 7.05, "learning_rate": 0.0002318378963348441, "loss": 0.5912, "step": 282000 }, { "epoch": 7.12, "learning_rate": 0.00023108052840718792, "loss": 0.5838, "step": 285000 }, { "epoch": 7.2, "learning_rate": 0.00023032316047953168, "loss": 0.5823, "step": 288000 }, { "epoch": 7.27, "learning_rate": 0.00022956579255187546, "loss": 0.5836, "step": 291000 }, { "epoch": 7.35, "learning_rate": 0.00022880842462421927, "loss": 0.5857, "step": 294000 }, { "epoch": 7.42, "learning_rate": 0.00022805105669656303, "loss": 0.5872, "step": 297000 }, { "epoch": 7.5, "learning_rate": 0.0002272936887689068, "loss": 0.59, "step": 300000 }, { "epoch": 7.57, "learning_rate": 0.00022653632084125062, "loss": 0.5808, "step": 303000 }, { "epoch": 7.65, "learning_rate": 0.0002257789529135944, "loss": 0.5826, "step": 306000 }, { "epoch": 7.72, "learning_rate": 0.00022502158498593816, "loss": 0.5813, "step": 309000 }, { "epoch": 7.8, "learning_rate": 0.00022426421705828197, "loss": 0.5883, "step": 312000 }, { "epoch": 7.87, "learning_rate": 0.00022350684913062576, "loss": 0.5852, "step": 315000 }, { "epoch": 7.95, "learning_rate": 0.00022274948120296954, "loss": 0.5824, "step": 318000 }, { "epoch": 8.02, "learning_rate": 0.00022199211327531332, "loss": 0.575, "step": 321000 }, { "epoch": 8.1, "learning_rate": 0.0002212347453476571, "loss": 0.5561, "step": 324000 }, { "epoch": 8.17, "learning_rate": 0.0002204773774200009, "loss": 0.5596, "step": 327000 }, { "epoch": 8.25, "learning_rate": 0.00021972000949234467, "loss": 0.5603, "step": 330000 }, { "epoch": 8.32, "learning_rate": 0.00021896264156468846, "loss": 0.5575, "step": 333000 }, { "epoch": 8.4, "learning_rate": 0.00021820527363703224, "loss": 0.5545, "step": 336000 }, { "epoch": 8.47, "learning_rate": 0.00021744790570937602, "loss": 0.5629, "step": 339000 }, { "epoch": 8.55, "learning_rate": 0.00021669053778171983, "loss": 0.5572, "step": 342000 }, { "epoch": 8.62, "learning_rate": 0.0002159331698540636, "loss": 0.5602, "step": 345000 }, { "epoch": 8.7, "learning_rate": 0.00021517580192640737, "loss": 0.5521, "step": 348000 }, { "epoch": 8.77, "learning_rate": 0.00021441843399875118, "loss": 0.5582, "step": 351000 }, { "epoch": 8.85, "learning_rate": 0.00021366106607109497, "loss": 0.5539, "step": 354000 }, { "epoch": 8.92, "learning_rate": 0.00021290369814343872, "loss": 0.5589, "step": 357000 }, { "epoch": 9.0, "learning_rate": 0.00021214633021578253, "loss": 0.5549, "step": 360000 }, { "epoch": 9.07, "learning_rate": 0.00021138896228812632, "loss": 0.525, "step": 363000 }, { "epoch": 9.15, "learning_rate": 0.00021063159436047007, "loss": 0.5304, "step": 366000 }, { "epoch": 9.22, "learning_rate": 0.00020987422643281388, "loss": 0.5331, "step": 369000 }, { "epoch": 9.3, "learning_rate": 0.00020911685850515766, "loss": 0.5341, "step": 372000 }, { "epoch": 9.37, "learning_rate": 0.00020835949057750142, "loss": 0.5319, "step": 375000 }, { "epoch": 9.45, "learning_rate": 0.00020760212264984523, "loss": 0.5318, "step": 378000 }, { "epoch": 9.52, "learning_rate": 0.00020684475472218901, "loss": 0.5368, "step": 381000 }, { "epoch": 9.6, "learning_rate": 0.00020608738679453277, "loss": 0.532, "step": 384000 }, { "epoch": 9.67, "learning_rate": 0.00020533001886687658, "loss": 0.5353, "step": 387000 }, { "epoch": 9.75, "learning_rate": 0.00020457265093922036, "loss": 0.5329, "step": 390000 }, { "epoch": 9.82, "learning_rate": 0.00020381528301156415, "loss": 0.5354, "step": 393000 }, { "epoch": 9.9, "learning_rate": 0.00020305791508390793, "loss": 0.5375, "step": 396000 }, { "epoch": 9.97, "learning_rate": 0.00020230054715625171, "loss": 0.5377, "step": 399000 }, { "epoch": 10.05, "learning_rate": 0.0002015431792285955, "loss": 0.5191, "step": 402000 }, { "epoch": 10.12, "learning_rate": 0.00020078581130093928, "loss": 0.5057, "step": 405000 }, { "epoch": 10.2, "learning_rate": 0.00020002844337328306, "loss": 0.508, "step": 408000 }, { "epoch": 10.27, "learning_rate": 0.00019927107544562687, "loss": 0.5117, "step": 411000 }, { "epoch": 10.35, "learning_rate": 0.00019851370751797063, "loss": 0.5135, "step": 414000 }, { "epoch": 10.42, "learning_rate": 0.00019775633959031441, "loss": 0.5129, "step": 417000 }, { "epoch": 10.5, "learning_rate": 0.00019699897166265822, "loss": 0.511, "step": 420000 }, { "epoch": 10.57, "learning_rate": 0.00019624160373500198, "loss": 0.5095, "step": 423000 }, { "epoch": 10.65, "learning_rate": 0.0001954842358073458, "loss": 0.5129, "step": 426000 }, { "epoch": 10.72, "learning_rate": 0.00019472686787968957, "loss": 0.5124, "step": 429000 }, { "epoch": 10.8, "learning_rate": 0.00019396949995203333, "loss": 0.5138, "step": 432000 }, { "epoch": 10.87, "learning_rate": 0.00019321213202437714, "loss": 0.5125, "step": 435000 }, { "epoch": 10.95, "learning_rate": 0.00019245476409672092, "loss": 0.5153, "step": 438000 }, { "epoch": 11.02, "learning_rate": 0.00019169739616906468, "loss": 0.5022, "step": 441000 }, { "epoch": 11.1, "learning_rate": 0.0001909400282414085, "loss": 0.4908, "step": 444000 }, { "epoch": 11.17, "learning_rate": 0.00019018266031375227, "loss": 0.4908, "step": 447000 }, { "epoch": 11.25, "learning_rate": 0.00018942529238609603, "loss": 0.4907, "step": 450000 }, { "epoch": 11.32, "learning_rate": 0.00018866792445843984, "loss": 0.4895, "step": 453000 }, { "epoch": 11.4, "learning_rate": 0.00018791055653078362, "loss": 0.4948, "step": 456000 }, { "epoch": 11.47, "learning_rate": 0.00018715318860312743, "loss": 0.4908, "step": 459000 }, { "epoch": 11.55, "learning_rate": 0.0001863958206754712, "loss": 0.4931, "step": 462000 }, { "epoch": 11.62, "learning_rate": 0.00018563845274781497, "loss": 0.4945, "step": 465000 }, { "epoch": 11.7, "learning_rate": 0.00018488108482015878, "loss": 0.5005, "step": 468000 }, { "epoch": 11.77, "learning_rate": 0.00018412371689250254, "loss": 0.4974, "step": 471000 }, { "epoch": 11.85, "learning_rate": 0.00018336634896484632, "loss": 0.4997, "step": 474000 }, { "epoch": 11.92, "learning_rate": 0.00018260898103719013, "loss": 0.5027, "step": 477000 }, { "epoch": 12.0, "learning_rate": 0.0001818516131095339, "loss": 0.4962, "step": 480000 }, { "epoch": 12.07, "learning_rate": 0.00018109424518187767, "loss": 0.4766, "step": 483000 }, { "epoch": 12.15, "learning_rate": 0.00018033687725422148, "loss": 0.4782, "step": 486000 }, { "epoch": 12.22, "learning_rate": 0.00017957950932656524, "loss": 0.4793, "step": 489000 }, { "epoch": 12.3, "learning_rate": 0.00017882214139890902, "loss": 0.4769, "step": 492000 }, { "epoch": 12.37, "learning_rate": 0.00017806477347125283, "loss": 0.4801, "step": 495000 }, { "epoch": 12.45, "learning_rate": 0.0001773074055435966, "loss": 0.4783, "step": 498000 }, { "epoch": 12.52, "learning_rate": 0.0001765500376159404, "loss": 0.4804, "step": 501000 }, { "epoch": 12.6, "learning_rate": 0.00017579266968828418, "loss": 0.4758, "step": 504000 }, { "epoch": 12.67, "learning_rate": 0.00017503530176062794, "loss": 0.4761, "step": 507000 }, { "epoch": 12.75, "learning_rate": 0.00017427793383297175, "loss": 0.473, "step": 510000 }, { "epoch": 12.82, "learning_rate": 0.00017352056590531553, "loss": 0.4793, "step": 513000 }, { "epoch": 12.9, "learning_rate": 0.00017276319797765932, "loss": 0.4772, "step": 516000 }, { "epoch": 12.97, "learning_rate": 0.0001720058300500031, "loss": 0.4752, "step": 519000 }, { "epoch": 13.05, "learning_rate": 0.00017124846212234688, "loss": 0.4611, "step": 522000 }, { "epoch": 13.12, "learning_rate": 0.0001704910941946907, "loss": 0.4549, "step": 525000 }, { "epoch": 13.2, "learning_rate": 0.00016973372626703445, "loss": 0.456, "step": 528000 }, { "epoch": 13.27, "learning_rate": 0.00016897635833937823, "loss": 0.4596, "step": 531000 }, { "epoch": 13.35, "learning_rate": 0.00016821899041172204, "loss": 0.4629, "step": 534000 }, { "epoch": 13.42, "learning_rate": 0.0001674616224840658, "loss": 0.463, "step": 537000 }, { "epoch": 13.5, "learning_rate": 0.00016670425455640958, "loss": 0.461, "step": 540000 }, { "epoch": 13.57, "learning_rate": 0.0001659468866287534, "loss": 0.4631, "step": 543000 }, { "epoch": 13.65, "learning_rate": 0.00016518951870109715, "loss": 0.4627, "step": 546000 }, { "epoch": 13.72, "learning_rate": 0.00016443215077344093, "loss": 0.4639, "step": 549000 }, { "epoch": 13.8, "learning_rate": 0.00016367478284578474, "loss": 0.4663, "step": 552000 }, { "epoch": 13.87, "learning_rate": 0.0001629174149181285, "loss": 0.46, "step": 555000 }, { "epoch": 13.95, "learning_rate": 0.00016216004699047228, "loss": 0.4661, "step": 558000 }, { "epoch": 14.02, "learning_rate": 0.0001614026790628161, "loss": 0.4564, "step": 561000 }, { "epoch": 14.1, "learning_rate": 0.00016064531113515988, "loss": 0.4391, "step": 564000 }, { "epoch": 14.17, "learning_rate": 0.00015988794320750363, "loss": 0.4479, "step": 567000 }, { "epoch": 14.25, "learning_rate": 0.00015913057527984744, "loss": 0.4427, "step": 570000 }, { "epoch": 14.32, "learning_rate": 0.00015837320735219123, "loss": 0.4425, "step": 573000 }, { "epoch": 14.4, "learning_rate": 0.00015761583942453498, "loss": 0.4412, "step": 576000 }, { "epoch": 14.47, "learning_rate": 0.0001568584714968788, "loss": 0.4488, "step": 579000 }, { "epoch": 14.55, "learning_rate": 0.00015610110356922258, "loss": 0.4464, "step": 582000 }, { "epoch": 14.62, "learning_rate": 0.00015534373564156636, "loss": 0.4507, "step": 585000 }, { "epoch": 14.7, "learning_rate": 0.00015458636771391014, "loss": 0.4476, "step": 588000 }, { "epoch": 14.77, "learning_rate": 0.00015382899978625393, "loss": 0.4462, "step": 591000 }, { "epoch": 14.85, "learning_rate": 0.0001530716318585977, "loss": 0.4493, "step": 594000 }, { "epoch": 14.92, "learning_rate": 0.0001523142639309415, "loss": 0.4478, "step": 597000 }, { "epoch": 15.0, "learning_rate": 0.00015155689600328528, "loss": 0.4483, "step": 600000 }, { "epoch": 15.07, "learning_rate": 0.00015079952807562906, "loss": 0.4276, "step": 603000 }, { "epoch": 15.15, "learning_rate": 0.00015004216014797284, "loss": 0.4259, "step": 606000 }, { "epoch": 15.22, "learning_rate": 0.00014928479222031665, "loss": 0.4263, "step": 609000 }, { "epoch": 15.3, "learning_rate": 0.0001485274242926604, "loss": 0.4302, "step": 612000 }, { "epoch": 15.37, "learning_rate": 0.0001477700563650042, "loss": 0.4302, "step": 615000 }, { "epoch": 15.45, "learning_rate": 0.000147012688437348, "loss": 0.4289, "step": 618000 }, { "epoch": 15.52, "learning_rate": 0.00014625532050969176, "loss": 0.4311, "step": 621000 }, { "epoch": 15.6, "learning_rate": 0.00014549795258203554, "loss": 0.4327, "step": 624000 }, { "epoch": 15.67, "learning_rate": 0.00014474058465437935, "loss": 0.4315, "step": 627000 }, { "epoch": 15.75, "learning_rate": 0.0001439832167267231, "loss": 0.4305, "step": 630000 }, { "epoch": 15.82, "learning_rate": 0.00014322584879906692, "loss": 0.429, "step": 633000 }, { "epoch": 15.9, "learning_rate": 0.0001424684808714107, "loss": 0.4288, "step": 636000 }, { "epoch": 15.97, "learning_rate": 0.00014171111294375449, "loss": 0.4309, "step": 639000 }, { "epoch": 16.05, "learning_rate": 0.00014095374501609827, "loss": 0.4179, "step": 642000 }, { "epoch": 16.12, "learning_rate": 0.00014019637708844205, "loss": 0.4098, "step": 645000 }, { "epoch": 16.2, "learning_rate": 0.00013943900916078584, "loss": 0.4136, "step": 648000 }, { "epoch": 16.27, "learning_rate": 0.00013868164123312962, "loss": 0.4115, "step": 651000 }, { "epoch": 16.35, "learning_rate": 0.0001379242733054734, "loss": 0.4192, "step": 654000 }, { "epoch": 16.42, "learning_rate": 0.00013716690537781719, "loss": 0.4159, "step": 657000 }, { "epoch": 16.5, "learning_rate": 0.00013640953745016097, "loss": 0.4176, "step": 660000 }, { "epoch": 16.57, "learning_rate": 0.00013565216952250475, "loss": 0.4166, "step": 663000 }, { "epoch": 16.65, "learning_rate": 0.00013489480159484854, "loss": 0.4214, "step": 666000 }, { "epoch": 16.72, "learning_rate": 0.00013413743366719232, "loss": 0.4121, "step": 669000 }, { "epoch": 16.8, "learning_rate": 0.0001333800657395361, "loss": 0.414, "step": 672000 }, { "epoch": 16.87, "learning_rate": 0.00013262269781187989, "loss": 0.4159, "step": 675000 }, { "epoch": 16.95, "learning_rate": 0.00013186532988422367, "loss": 0.4135, "step": 678000 }, { "epoch": 17.02, "learning_rate": 0.00013110796195656748, "loss": 0.4098, "step": 681000 }, { "epoch": 17.1, "learning_rate": 0.00013035059402891124, "loss": 0.3947, "step": 684000 }, { "epoch": 17.17, "learning_rate": 0.00012959322610125502, "loss": 0.4026, "step": 687000 }, { "epoch": 17.25, "learning_rate": 0.00012883585817359883, "loss": 0.397, "step": 690000 }, { "epoch": 17.32, "learning_rate": 0.0001280784902459426, "loss": 0.3998, "step": 693000 }, { "epoch": 17.39, "learning_rate": 0.00012732112231828637, "loss": 0.3988, "step": 696000 }, { "epoch": 17.47, "learning_rate": 0.00012656375439063018, "loss": 0.4015, "step": 699000 }, { "epoch": 17.54, "learning_rate": 0.00012580638646297396, "loss": 0.402, "step": 702000 }, { "epoch": 17.62, "learning_rate": 0.00012504901853531775, "loss": 0.4018, "step": 705000 }, { "epoch": 17.69, "learning_rate": 0.00012429165060766153, "loss": 0.3987, "step": 708000 }, { "epoch": 17.77, "learning_rate": 0.0001235342826800053, "loss": 0.3974, "step": 711000 }, { "epoch": 17.84, "learning_rate": 0.0001227769147523491, "loss": 0.4006, "step": 714000 }, { "epoch": 17.92, "learning_rate": 0.00012201954682469288, "loss": 0.4028, "step": 717000 }, { "epoch": 17.99, "learning_rate": 0.00012126217889703665, "loss": 0.4019, "step": 720000 }, { "epoch": 18.07, "learning_rate": 0.00012050481096938045, "loss": 0.3853, "step": 723000 }, { "epoch": 18.14, "learning_rate": 0.00011974744304172423, "loss": 0.387, "step": 726000 }, { "epoch": 18.22, "learning_rate": 0.00011899007511406801, "loss": 0.3822, "step": 729000 }, { "epoch": 18.29, "learning_rate": 0.0001182327071864118, "loss": 0.3861, "step": 732000 }, { "epoch": 18.37, "learning_rate": 0.00011747533925875558, "loss": 0.3877, "step": 735000 }, { "epoch": 18.44, "learning_rate": 0.00011671797133109938, "loss": 0.3867, "step": 738000 }, { "epoch": 18.52, "learning_rate": 0.00011596060340344316, "loss": 0.388, "step": 741000 }, { "epoch": 18.59, "learning_rate": 0.00011520323547578693, "loss": 0.3889, "step": 744000 }, { "epoch": 18.67, "learning_rate": 0.00011444586754813073, "loss": 0.3867, "step": 747000 }, { "epoch": 18.74, "learning_rate": 0.00011368849962047451, "loss": 0.3897, "step": 750000 }, { "epoch": 18.82, "learning_rate": 0.00011293113169281828, "loss": 0.3873, "step": 753000 }, { "epoch": 18.89, "learning_rate": 0.00011217376376516208, "loss": 0.3881, "step": 756000 }, { "epoch": 18.97, "learning_rate": 0.00011141639583750586, "loss": 0.3915, "step": 759000 }, { "epoch": 19.04, "learning_rate": 0.00011065902790984966, "loss": 0.3766, "step": 762000 }, { "epoch": 19.12, "learning_rate": 0.00010990165998219343, "loss": 0.3757, "step": 765000 }, { "epoch": 19.19, "learning_rate": 0.00010914429205453721, "loss": 0.3705, "step": 768000 }, { "epoch": 19.27, "learning_rate": 0.000108386924126881, "loss": 0.3728, "step": 771000 }, { "epoch": 19.34, "learning_rate": 0.00010762955619922477, "loss": 0.3756, "step": 774000 }, { "epoch": 19.42, "learning_rate": 0.00010687218827156856, "loss": 0.3772, "step": 777000 }, { "epoch": 19.49, "learning_rate": 0.00010611482034391236, "loss": 0.3761, "step": 780000 }, { "epoch": 19.57, "learning_rate": 0.00010535745241625614, "loss": 0.3739, "step": 783000 }, { "epoch": 19.64, "learning_rate": 0.00010460008448859992, "loss": 0.371, "step": 786000 }, { "epoch": 19.72, "learning_rate": 0.0001038427165609437, "loss": 0.3721, "step": 789000 }, { "epoch": 19.79, "learning_rate": 0.00010308534863328749, "loss": 0.3733, "step": 792000 }, { "epoch": 19.87, "learning_rate": 0.00010232798070563129, "loss": 0.3742, "step": 795000 }, { "epoch": 19.94, "learning_rate": 0.00010157061277797505, "loss": 0.3732, "step": 798000 }, { "epoch": 20.02, "learning_rate": 0.00010081324485031884, "loss": 0.3703, "step": 801000 }, { "epoch": 20.09, "learning_rate": 0.00010005587692266263, "loss": 0.3609, "step": 804000 }, { "epoch": 20.17, "learning_rate": 9.92985089950064e-05, "loss": 0.3604, "step": 807000 }, { "epoch": 20.24, "learning_rate": 9.85411410673502e-05, "loss": 0.3614, "step": 810000 }, { "epoch": 20.32, "learning_rate": 9.778377313969398e-05, "loss": 0.3617, "step": 813000 }, { "epoch": 20.39, "learning_rate": 9.702640521203775e-05, "loss": 0.3611, "step": 816000 }, { "epoch": 20.47, "learning_rate": 9.626903728438155e-05, "loss": 0.3607, "step": 819000 }, { "epoch": 20.54, "learning_rate": 9.551166935672533e-05, "loss": 0.3605, "step": 822000 }, { "epoch": 20.62, "learning_rate": 9.475430142906912e-05, "loss": 0.3589, "step": 825000 }, { "epoch": 20.69, "learning_rate": 9.39969335014129e-05, "loss": 0.3586, "step": 828000 }, { "epoch": 20.77, "learning_rate": 9.323956557375668e-05, "loss": 0.3626, "step": 831000 }, { "epoch": 20.84, "learning_rate": 9.248219764610048e-05, "loss": 0.3616, "step": 834000 }, { "epoch": 20.92, "learning_rate": 9.172482971844426e-05, "loss": 0.3595, "step": 837000 }, { "epoch": 20.99, "learning_rate": 9.096746179078803e-05, "loss": 0.3624, "step": 840000 }, { "epoch": 21.07, "learning_rate": 9.021009386313183e-05, "loss": 0.3468, "step": 843000 }, { "epoch": 21.14, "learning_rate": 8.945272593547561e-05, "loss": 0.3475, "step": 846000 }, { "epoch": 21.22, "learning_rate": 8.869535800781938e-05, "loss": 0.3486, "step": 849000 }, { "epoch": 21.29, "learning_rate": 8.793799008016318e-05, "loss": 0.3454, "step": 852000 }, { "epoch": 21.37, "learning_rate": 8.718062215250696e-05, "loss": 0.3496, "step": 855000 }, { "epoch": 21.44, "learning_rate": 8.642325422485073e-05, "loss": 0.3478, "step": 858000 }, { "epoch": 21.52, "learning_rate": 8.566588629719453e-05, "loss": 0.346, "step": 861000 }, { "epoch": 21.59, "learning_rate": 8.490851836953831e-05, "loss": 0.3513, "step": 864000 }, { "epoch": 21.67, "learning_rate": 8.415115044188211e-05, "loss": 0.3524, "step": 867000 }, { "epoch": 21.74, "learning_rate": 8.339378251422588e-05, "loss": 0.3494, "step": 870000 }, { "epoch": 21.82, "learning_rate": 8.263641458656966e-05, "loss": 0.3521, "step": 873000 }, { "epoch": 21.89, "learning_rate": 8.187904665891346e-05, "loss": 0.3509, "step": 876000 }, { "epoch": 21.97, "learning_rate": 8.112167873125724e-05, "loss": 0.3501, "step": 879000 }, { "epoch": 22.04, "learning_rate": 8.036431080360101e-05, "loss": 0.3394, "step": 882000 }, { "epoch": 22.12, "learning_rate": 7.960694287594481e-05, "loss": 0.3357, "step": 885000 }, { "epoch": 22.19, "learning_rate": 7.88495749482886e-05, "loss": 0.3378, "step": 888000 }, { "epoch": 22.27, "learning_rate": 7.809220702063239e-05, "loss": 0.3362, "step": 891000 }, { "epoch": 22.34, "learning_rate": 7.733483909297616e-05, "loss": 0.3382, "step": 894000 }, { "epoch": 22.42, "learning_rate": 7.657747116531994e-05, "loss": 0.3407, "step": 897000 }, { "epoch": 22.49, "learning_rate": 7.582010323766374e-05, "loss": 0.3381, "step": 900000 }, { "epoch": 22.57, "learning_rate": 7.506273531000751e-05, "loss": 0.3378, "step": 903000 }, { "epoch": 22.64, "learning_rate": 7.430536738235131e-05, "loss": 0.3368, "step": 906000 }, { "epoch": 22.72, "learning_rate": 7.354799945469509e-05, "loss": 0.3384, "step": 909000 }, { "epoch": 22.79, "learning_rate": 7.279063152703887e-05, "loss": 0.3351, "step": 912000 }, { "epoch": 22.87, "learning_rate": 7.203326359938266e-05, "loss": 0.3399, "step": 915000 }, { "epoch": 22.94, "learning_rate": 7.127589567172644e-05, "loss": 0.3366, "step": 918000 }, { "epoch": 23.02, "learning_rate": 7.051852774407022e-05, "loss": 0.3313, "step": 921000 }, { "epoch": 23.09, "learning_rate": 6.976115981641401e-05, "loss": 0.3291, "step": 924000 }, { "epoch": 23.17, "learning_rate": 6.900379188875779e-05, "loss": 0.3269, "step": 927000 }, { "epoch": 23.24, "learning_rate": 6.824642396110157e-05, "loss": 0.324, "step": 930000 }, { "epoch": 23.32, "learning_rate": 6.748905603344537e-05, "loss": 0.3249, "step": 933000 }, { "epoch": 23.39, "learning_rate": 6.673168810578914e-05, "loss": 0.3257, "step": 936000 }, { "epoch": 23.47, "learning_rate": 6.597432017813294e-05, "loss": 0.3266, "step": 939000 }, { "epoch": 23.54, "learning_rate": 6.521695225047672e-05, "loss": 0.3275, "step": 942000 }, { "epoch": 23.62, "learning_rate": 6.44595843228205e-05, "loss": 0.3279, "step": 945000 }, { "epoch": 23.69, "learning_rate": 6.370221639516429e-05, "loss": 0.3255, "step": 948000 }, { "epoch": 23.77, "learning_rate": 6.294484846750807e-05, "loss": 0.3255, "step": 951000 }, { "epoch": 23.84, "learning_rate": 6.218748053985185e-05, "loss": 0.3233, "step": 954000 }, { "epoch": 23.92, "learning_rate": 6.143011261219564e-05, "loss": 0.3267, "step": 957000 }, { "epoch": 23.99, "learning_rate": 6.067274468453942e-05, "loss": 0.3234, "step": 960000 }, { "epoch": 24.07, "learning_rate": 5.991537675688321e-05, "loss": 0.3174, "step": 963000 }, { "epoch": 24.14, "learning_rate": 5.9158008829226994e-05, "loss": 0.3149, "step": 966000 }, { "epoch": 24.22, "learning_rate": 5.840064090157077e-05, "loss": 0.314, "step": 969000 }, { "epoch": 24.29, "learning_rate": 5.764327297391456e-05, "loss": 0.318, "step": 972000 }, { "epoch": 24.37, "learning_rate": 5.6885905046258344e-05, "loss": 0.3147, "step": 975000 }, { "epoch": 24.44, "learning_rate": 5.6128537118602134e-05, "loss": 0.3158, "step": 978000 }, { "epoch": 24.52, "learning_rate": 5.537116919094591e-05, "loss": 0.3164, "step": 981000 }, { "epoch": 24.59, "learning_rate": 5.46138012632897e-05, "loss": 0.3162, "step": 984000 }, { "epoch": 24.67, "learning_rate": 5.3856433335633483e-05, "loss": 0.3153, "step": 987000 }, { "epoch": 24.74, "learning_rate": 5.3099065407977274e-05, "loss": 0.3165, "step": 990000 }, { "epoch": 24.82, "learning_rate": 5.234169748032105e-05, "loss": 0.315, "step": 993000 }, { "epoch": 24.89, "learning_rate": 5.158432955266483e-05, "loss": 0.3132, "step": 996000 }, { "epoch": 24.97, "learning_rate": 5.0826961625008623e-05, "loss": 0.3163, "step": 999000 }, { "epoch": 25.04, "learning_rate": 5.006959369735241e-05, "loss": 0.3107, "step": 1002000 }, { "epoch": 25.12, "learning_rate": 4.931222576969619e-05, "loss": 0.3062, "step": 1005000 }, { "epoch": 25.19, "learning_rate": 4.855485784203997e-05, "loss": 0.3083, "step": 1008000 }, { "epoch": 25.27, "learning_rate": 4.779748991438376e-05, "loss": 0.3096, "step": 1011000 }, { "epoch": 25.34, "learning_rate": 4.704012198672755e-05, "loss": 0.3056, "step": 1014000 }, { "epoch": 25.42, "learning_rate": 4.628275405907132e-05, "loss": 0.3042, "step": 1017000 }, { "epoch": 25.49, "learning_rate": 4.552538613141511e-05, "loss": 0.3063, "step": 1020000 }, { "epoch": 25.57, "learning_rate": 4.4768018203758897e-05, "loss": 0.3025, "step": 1023000 }, { "epoch": 25.64, "learning_rate": 4.4010650276102687e-05, "loss": 0.3056, "step": 1026000 }, { "epoch": 25.72, "learning_rate": 4.325328234844646e-05, "loss": 0.3081, "step": 1029000 }, { "epoch": 25.79, "learning_rate": 4.249591442079025e-05, "loss": 0.3082, "step": 1032000 }, { "epoch": 25.87, "learning_rate": 4.1738546493134036e-05, "loss": 0.3054, "step": 1035000 }, { "epoch": 25.94, "learning_rate": 4.0981178565477827e-05, "loss": 0.3052, "step": 1038000 }, { "epoch": 26.02, "learning_rate": 4.02238106378216e-05, "loss": 0.3013, "step": 1041000 }, { "epoch": 26.09, "learning_rate": 3.9466442710165386e-05, "loss": 0.2982, "step": 1044000 }, { "epoch": 26.17, "learning_rate": 3.8709074782509176e-05, "loss": 0.2962, "step": 1047000 }, { "epoch": 26.24, "learning_rate": 3.795170685485296e-05, "loss": 0.2961, "step": 1050000 }, { "epoch": 26.32, "learning_rate": 3.719433892719674e-05, "loss": 0.2993, "step": 1053000 }, { "epoch": 26.39, "learning_rate": 3.6436970999540526e-05, "loss": 0.2941, "step": 1056000 }, { "epoch": 26.47, "learning_rate": 3.5679603071884316e-05, "loss": 0.2948, "step": 1059000 }, { "epoch": 26.54, "learning_rate": 3.492223514422809e-05, "loss": 0.2966, "step": 1062000 }, { "epoch": 26.62, "learning_rate": 3.4164867216571876e-05, "loss": 0.2964, "step": 1065000 }, { "epoch": 26.69, "learning_rate": 3.3407499288915666e-05, "loss": 0.2971, "step": 1068000 }, { "epoch": 26.77, "learning_rate": 3.265013136125945e-05, "loss": 0.2974, "step": 1071000 }, { "epoch": 26.84, "learning_rate": 3.189276343360323e-05, "loss": 0.2963, "step": 1074000 }, { "epoch": 26.92, "learning_rate": 3.1135395505947016e-05, "loss": 0.2954, "step": 1077000 }, { "epoch": 26.99, "learning_rate": 3.0378027578290803e-05, "loss": 0.2941, "step": 1080000 }, { "epoch": 27.07, "learning_rate": 2.9620659650634586e-05, "loss": 0.2896, "step": 1083000 }, { "epoch": 27.14, "learning_rate": 2.8863291722978373e-05, "loss": 0.289, "step": 1086000 }, { "epoch": 27.22, "learning_rate": 2.8105923795322156e-05, "loss": 0.2899, "step": 1089000 }, { "epoch": 27.29, "learning_rate": 2.7348555867665943e-05, "loss": 0.2878, "step": 1092000 }, { "epoch": 27.37, "learning_rate": 2.6591187940009723e-05, "loss": 0.2858, "step": 1095000 }, { "epoch": 27.44, "learning_rate": 2.583382001235351e-05, "loss": 0.2901, "step": 1098000 }, { "epoch": 27.52, "learning_rate": 2.5076452084697293e-05, "loss": 0.2885, "step": 1101000 }, { "epoch": 27.59, "learning_rate": 2.431908415704108e-05, "loss": 0.2903, "step": 1104000 }, { "epoch": 27.67, "learning_rate": 2.3561716229384863e-05, "loss": 0.2904, "step": 1107000 }, { "epoch": 27.74, "learning_rate": 2.280434830172865e-05, "loss": 0.2869, "step": 1110000 }, { "epoch": 27.82, "learning_rate": 2.2046980374072432e-05, "loss": 0.2902, "step": 1113000 }, { "epoch": 27.89, "learning_rate": 2.128961244641622e-05, "loss": 0.2867, "step": 1116000 }, { "epoch": 27.97, "learning_rate": 2.053224451876e-05, "loss": 0.2869, "step": 1119000 }, { "epoch": 28.04, "learning_rate": 1.9774876591103786e-05, "loss": 0.2844, "step": 1122000 }, { "epoch": 28.12, "learning_rate": 1.901750866344757e-05, "loss": 0.283, "step": 1125000 }, { "epoch": 28.19, "learning_rate": 1.8260140735791356e-05, "loss": 0.2816, "step": 1128000 }, { "epoch": 28.27, "learning_rate": 1.750277280813514e-05, "loss": 0.2848, "step": 1131000 }, { "epoch": 28.34, "learning_rate": 1.6745404880478922e-05, "loss": 0.2811, "step": 1134000 }, { "epoch": 28.42, "learning_rate": 1.598803695282271e-05, "loss": 0.283, "step": 1137000 }, { "epoch": 28.49, "learning_rate": 1.5230669025166492e-05, "loss": 0.2814, "step": 1140000 }, { "epoch": 28.57, "learning_rate": 1.4473301097510277e-05, "loss": 0.2793, "step": 1143000 }, { "epoch": 28.64, "learning_rate": 1.3715933169854062e-05, "loss": 0.2834, "step": 1146000 }, { "epoch": 28.72, "learning_rate": 1.2958565242197847e-05, "loss": 0.2798, "step": 1149000 }, { "epoch": 28.79, "learning_rate": 1.220119731454163e-05, "loss": 0.2807, "step": 1152000 }, { "epoch": 28.87, "learning_rate": 1.1443829386885416e-05, "loss": 0.28, "step": 1155000 }, { "epoch": 28.94, "learning_rate": 1.06864614592292e-05, "loss": 0.2762, "step": 1158000 }, { "epoch": 29.02, "learning_rate": 9.929093531572985e-06, "loss": 0.2802, "step": 1161000 }, { "epoch": 29.09, "learning_rate": 9.171725603916769e-06, "loss": 0.2761, "step": 1164000 }, { "epoch": 29.17, "learning_rate": 8.414357676260554e-06, "loss": 0.2771, "step": 1167000 }, { "epoch": 29.24, "learning_rate": 7.656989748604339e-06, "loss": 0.2777, "step": 1170000 }, { "epoch": 29.32, "learning_rate": 6.899621820948123e-06, "loss": 0.2767, "step": 1173000 }, { "epoch": 29.39, "learning_rate": 6.142253893291908e-06, "loss": 0.2773, "step": 1176000 }, { "epoch": 29.47, "learning_rate": 5.384885965635692e-06, "loss": 0.2765, "step": 1179000 }, { "epoch": 29.54, "learning_rate": 4.627518037979477e-06, "loss": 0.2786, "step": 1182000 }, { "epoch": 29.62, "learning_rate": 3.870150110323261e-06, "loss": 0.2768, "step": 1185000 }, { "epoch": 29.69, "learning_rate": 3.1127821826670457e-06, "loss": 0.2771, "step": 1188000 }, { "epoch": 29.77, "learning_rate": 2.35541425501083e-06, "loss": 0.2737, "step": 1191000 }, { "epoch": 29.84, "learning_rate": 1.5980463273546148e-06, "loss": 0.2737, "step": 1194000 }, { "epoch": 29.92, "learning_rate": 8.406783996983991e-07, "loss": 0.2781, "step": 1197000 }, { "epoch": 29.99, "learning_rate": 8.33104720421837e-08, "loss": 0.2745, "step": 1200000 } ], "logging_steps": 3000, "max_steps": 1200330, "num_train_epochs": 30, "save_steps": 500, "total_flos": 2.468458772043806e+21, "trial_name": null, "trial_params": null }