diff --git "a/checkpoint-80000/trainer_state.json" "b/checkpoint-80000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-80000/trainer_state.json" @@ -0,0 +1,4949 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.7812544311575365, + "eval_steps": 5290, + "global_step": 80000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "eval_loss": 1.797255277633667, + "eval_runtime": 160.9316, + "eval_samples_per_second": 5.773, + "eval_steps_per_second": 5.773, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 2.9999916262476826e-06, + "loss": 2.339, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 2.9999626800634057e-06, + "loss": 2.1788, + "step": 200 + }, + { + "epoch": 0.01, + "learning_rate": 2.9999130584664085e-06, + "loss": 2.1946, + "step": 300 + }, + { + "epoch": 0.02, + "learning_rate": 2.9998427621406735e-06, + "loss": 2.2431, + "step": 400 + }, + { + "epoch": 0.02, + "learning_rate": 2.9997517920551614e-06, + "loss": 2.1155, + "step": 500 + }, + { + "epoch": 0.03, + "learning_rate": 2.9996401494637996e-06, + "loss": 2.0998, + "step": 600 + }, + { + "epoch": 0.03, + "learning_rate": 2.9995078359054642e-06, + "loss": 2.0592, + "step": 700 + }, + { + "epoch": 0.04, + "learning_rate": 2.999354853203959e-06, + "loss": 2.0821, + "step": 800 + }, + { + "epoch": 0.04, + "learning_rate": 2.9991812034679892e-06, + "loss": 1.8844, + "step": 900 + }, + { + "epoch": 0.05, + "learning_rate": 2.9989868890911354e-06, + "loss": 2.1784, + "step": 1000 + }, + { + "epoch": 0.05, + "learning_rate": 2.9987719127518173e-06, + "loss": 2.0341, + "step": 1100 + }, + { + "epoch": 0.06, + "learning_rate": 2.9985362774132576e-06, + "loss": 2.1155, + "step": 1200 + }, + { + "epoch": 0.06, + "learning_rate": 2.9982799863234435e-06, + "loss": 2.0074, + "step": 1300 + }, + { + "epoch": 0.07, + "learning_rate": 2.998003043015078e-06, + "loss": 2.0324, + "step": 1400 + }, + { + "epoch": 0.07, + "learning_rate": 2.9977054513055346e-06, + "loss": 1.9387, + "step": 1500 + }, + { + "epoch": 0.08, + "learning_rate": 2.997387215296803e-06, + "loss": 2.0548, + "step": 1600 + }, + { + "epoch": 0.08, + "learning_rate": 2.997048339375433e-06, + "loss": 2.0709, + "step": 1700 + }, + { + "epoch": 0.09, + "learning_rate": 2.9966888282124733e-06, + "loss": 2.009, + "step": 1800 + }, + { + "epoch": 0.09, + "learning_rate": 2.9963086867634087e-06, + "loss": 1.9616, + "step": 1900 + }, + { + "epoch": 0.09, + "learning_rate": 2.9959079202680905e-06, + "loss": 1.983, + "step": 2000 + }, + { + "epoch": 0.1, + "learning_rate": 2.9954865342506646e-06, + "loss": 2.0902, + "step": 2100 + }, + { + "epoch": 0.1, + "learning_rate": 2.9950445345194956e-06, + "loss": 2.1337, + "step": 2200 + }, + { + "epoch": 0.11, + "learning_rate": 2.994581927167085e-06, + "loss": 1.9246, + "step": 2300 + }, + { + "epoch": 0.11, + "learning_rate": 2.994098718569992e-06, + "loss": 2.0217, + "step": 2400 + }, + { + "epoch": 0.12, + "learning_rate": 2.9935949153887393e-06, + "loss": 2.0509, + "step": 2500 + }, + { + "epoch": 0.12, + "learning_rate": 2.993070524567726e-06, + "loss": 2.013, + "step": 2600 + }, + { + "epoch": 0.13, + "learning_rate": 2.992525553335129e-06, + "loss": 1.8444, + "step": 2700 + }, + { + "epoch": 0.13, + "learning_rate": 2.991960009202806e-06, + "loss": 1.9667, + "step": 2800 + }, + { + "epoch": 0.14, + "learning_rate": 2.9913738999661895e-06, + "loss": 1.9942, + "step": 2900 + }, + { + "epoch": 0.14, + "learning_rate": 2.990767233704181e-06, + "loss": 1.975, + "step": 3000 + }, + { + "epoch": 0.15, + "learning_rate": 2.9901400187790383e-06, + "loss": 2.015, + "step": 3100 + }, + { + "epoch": 0.15, + "learning_rate": 2.989492263836262e-06, + "loss": 2.122, + "step": 3200 + }, + { + "epoch": 0.16, + "learning_rate": 2.9888239778044748e-06, + "loss": 1.8877, + "step": 3300 + }, + { + "epoch": 0.16, + "learning_rate": 2.988135169895298e-06, + "loss": 2.0659, + "step": 3400 + }, + { + "epoch": 0.17, + "learning_rate": 2.9874258496032273e-06, + "loss": 1.8897, + "step": 3500 + }, + { + "epoch": 0.17, + "learning_rate": 2.9866960267054987e-06, + "loss": 1.9466, + "step": 3600 + }, + { + "epoch": 0.17, + "learning_rate": 2.985945711261956e-06, + "loss": 1.9438, + "step": 3700 + }, + { + "epoch": 0.18, + "learning_rate": 2.9851749136149105e-06, + "loss": 2.0251, + "step": 3800 + }, + { + "epoch": 0.18, + "learning_rate": 2.984383644388999e-06, + "loss": 2.0244, + "step": 3900 + }, + { + "epoch": 0.19, + "learning_rate": 2.9835719144910395e-06, + "loss": 1.9022, + "step": 4000 + }, + { + "epoch": 0.19, + "learning_rate": 2.982739735109876e-06, + "loss": 2.0163, + "step": 4100 + }, + { + "epoch": 0.2, + "learning_rate": 2.98188711771623e-06, + "loss": 1.9168, + "step": 4200 + }, + { + "epoch": 0.2, + "learning_rate": 2.9810140740625364e-06, + "loss": 1.9695, + "step": 4300 + }, + { + "epoch": 0.21, + "learning_rate": 2.9801206161827883e-06, + "loss": 1.9114, + "step": 4400 + }, + { + "epoch": 0.21, + "learning_rate": 2.9792067563923653e-06, + "loss": 2.0469, + "step": 4500 + }, + { + "epoch": 0.22, + "learning_rate": 2.9782725072878657e-06, + "loss": 1.8072, + "step": 4600 + }, + { + "epoch": 0.22, + "learning_rate": 2.9773178817469342e-06, + "loss": 1.8899, + "step": 4700 + }, + { + "epoch": 0.23, + "learning_rate": 2.976342892928083e-06, + "loss": 1.9418, + "step": 4800 + }, + { + "epoch": 0.23, + "learning_rate": 2.9753475542705106e-06, + "loss": 2.1559, + "step": 4900 + }, + { + "epoch": 0.24, + "learning_rate": 2.974331879493916e-06, + "loss": 2.001, + "step": 5000 + }, + { + "epoch": 0.24, + "learning_rate": 2.973295882598313e-06, + "loss": 2.051, + "step": 5100 + }, + { + "epoch": 0.25, + "learning_rate": 2.9722395778638296e-06, + "loss": 1.9767, + "step": 5200 + }, + { + "epoch": 0.25, + "eval_loss": 1.4832066297531128, + "eval_runtime": 162.8547, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 5290 + }, + { + "epoch": 0.25, + "learning_rate": 2.971162979850521e-06, + "loss": 1.8538, + "step": 5300 + }, + { + "epoch": 0.26, + "learning_rate": 2.9700661033981615e-06, + "loss": 1.7968, + "step": 5400 + }, + { + "epoch": 0.26, + "learning_rate": 2.9689489636260424e-06, + "loss": 1.7703, + "step": 5500 + }, + { + "epoch": 0.26, + "learning_rate": 2.967811575932764e-06, + "loss": 1.9824, + "step": 5600 + }, + { + "epoch": 0.27, + "learning_rate": 2.9666539559960238e-06, + "loss": 1.9332, + "step": 5700 + }, + { + "epoch": 0.27, + "learning_rate": 2.965476119772398e-06, + "loss": 1.8362, + "step": 5800 + }, + { + "epoch": 0.28, + "learning_rate": 2.964278083497125e-06, + "loss": 1.7958, + "step": 5900 + }, + { + "epoch": 0.28, + "learning_rate": 2.963059863683877e-06, + "loss": 1.7677, + "step": 6000 + }, + { + "epoch": 0.29, + "learning_rate": 2.9618214771245376e-06, + "loss": 2.0132, + "step": 6100 + }, + { + "epoch": 0.29, + "learning_rate": 2.9605629408889673e-06, + "loss": 1.8406, + "step": 6200 + }, + { + "epoch": 0.3, + "learning_rate": 2.9592842723247676e-06, + "loss": 2.0235, + "step": 6300 + }, + { + "epoch": 0.3, + "learning_rate": 2.9579854890570448e-06, + "loss": 1.9383, + "step": 6400 + }, + { + "epoch": 0.31, + "learning_rate": 2.956666608988164e-06, + "loss": 1.8556, + "step": 6500 + }, + { + "epoch": 0.31, + "learning_rate": 2.9553276502975034e-06, + "loss": 1.8689, + "step": 6600 + }, + { + "epoch": 0.32, + "learning_rate": 2.9539686314412053e-06, + "loss": 2.0381, + "step": 6700 + }, + { + "epoch": 0.32, + "learning_rate": 2.9525895711519195e-06, + "loss": 2.0205, + "step": 6800 + }, + { + "epoch": 0.33, + "learning_rate": 2.951190488438546e-06, + "loss": 1.8647, + "step": 6900 + }, + { + "epoch": 0.33, + "learning_rate": 2.9497714025859727e-06, + "loss": 1.8074, + "step": 7000 + }, + { + "epoch": 0.34, + "learning_rate": 2.94833233315481e-06, + "loss": 1.9039, + "step": 7100 + }, + { + "epoch": 0.34, + "learning_rate": 2.9468732999811216e-06, + "loss": 2.0103, + "step": 7200 + }, + { + "epoch": 0.35, + "learning_rate": 2.94539432317615e-06, + "loss": 1.9635, + "step": 7300 + }, + { + "epoch": 0.35, + "learning_rate": 2.943895423126038e-06, + "loss": 1.8708, + "step": 7400 + }, + { + "epoch": 0.35, + "learning_rate": 2.942376620491553e-06, + "loss": 1.7572, + "step": 7500 + }, + { + "epoch": 0.36, + "learning_rate": 2.940837936207796e-06, + "loss": 1.9795, + "step": 7600 + }, + { + "epoch": 0.36, + "learning_rate": 2.9392793914839165e-06, + "loss": 2.0192, + "step": 7700 + }, + { + "epoch": 0.37, + "learning_rate": 2.937701007802819e-06, + "loss": 1.8849, + "step": 7800 + }, + { + "epoch": 0.37, + "learning_rate": 2.9361028069208675e-06, + "loss": 1.9925, + "step": 7900 + }, + { + "epoch": 0.38, + "learning_rate": 2.934484810867586e-06, + "loss": 2.004, + "step": 8000 + }, + { + "epoch": 0.38, + "learning_rate": 2.9328470419453527e-06, + "loss": 1.9084, + "step": 8100 + }, + { + "epoch": 0.39, + "learning_rate": 2.9311895227290954e-06, + "loss": 1.8507, + "step": 8200 + }, + { + "epoch": 0.39, + "learning_rate": 2.929512276065978e-06, + "loss": 1.8185, + "step": 8300 + }, + { + "epoch": 0.4, + "learning_rate": 2.9278153250750875e-06, + "loss": 1.7862, + "step": 8400 + }, + { + "epoch": 0.4, + "learning_rate": 2.9260986931471136e-06, + "loss": 1.8444, + "step": 8500 + }, + { + "epoch": 0.41, + "learning_rate": 2.924362403944027e-06, + "loss": 2.0304, + "step": 8600 + }, + { + "epoch": 0.41, + "learning_rate": 2.922606481398755e-06, + "loss": 1.9337, + "step": 8700 + }, + { + "epoch": 0.42, + "learning_rate": 2.920830949714848e-06, + "loss": 1.9937, + "step": 8800 + }, + { + "epoch": 0.42, + "learning_rate": 2.919035833366148e-06, + "loss": 1.9554, + "step": 8900 + }, + { + "epoch": 0.43, + "learning_rate": 2.917221157096452e-06, + "loss": 1.9068, + "step": 9000 + }, + { + "epoch": 0.43, + "learning_rate": 2.9153869459191693e-06, + "loss": 1.9063, + "step": 9100 + }, + { + "epoch": 0.43, + "learning_rate": 2.913533225116978e-06, + "loss": 1.9342, + "step": 9200 + }, + { + "epoch": 0.44, + "learning_rate": 2.9116600202414754e-06, + "loss": 2.0052, + "step": 9300 + }, + { + "epoch": 0.44, + "learning_rate": 2.9097673571128266e-06, + "loss": 1.8102, + "step": 9400 + }, + { + "epoch": 0.45, + "learning_rate": 2.9078552618194086e-06, + "loss": 1.9959, + "step": 9500 + }, + { + "epoch": 0.45, + "learning_rate": 2.9059237607174494e-06, + "loss": 1.9136, + "step": 9600 + }, + { + "epoch": 0.46, + "learning_rate": 2.9039728804306666e-06, + "loss": 1.9124, + "step": 9700 + }, + { + "epoch": 0.46, + "learning_rate": 2.9020026478498988e-06, + "loss": 1.9215, + "step": 9800 + }, + { + "epoch": 0.47, + "learning_rate": 2.9000130901327377e-06, + "loss": 1.93, + "step": 9900 + }, + { + "epoch": 0.47, + "learning_rate": 2.8980042347031482e-06, + "loss": 1.82, + "step": 10000 + }, + { + "epoch": 0.48, + "learning_rate": 2.8959761092510978e-06, + "loss": 1.8436, + "step": 10100 + }, + { + "epoch": 0.48, + "learning_rate": 2.8939287417321676e-06, + "loss": 1.8995, + "step": 10200 + }, + { + "epoch": 0.49, + "learning_rate": 2.8918621603671737e-06, + "loss": 1.9337, + "step": 10300 + }, + { + "epoch": 0.49, + "learning_rate": 2.8897763936417715e-06, + "loss": 1.9088, + "step": 10400 + }, + { + "epoch": 0.5, + "learning_rate": 2.88767147030607e-06, + "loss": 1.8474, + "step": 10500 + }, + { + "epoch": 0.5, + "eval_loss": 1.4355759620666504, + "eval_runtime": 163.1072, + "eval_samples_per_second": 5.696, + "eval_steps_per_second": 5.696, + "step": 10580 + }, + { + "epoch": 0.5, + "learning_rate": 2.885547419374229e-06, + "loss": 1.9638, + "step": 10600 + }, + { + "epoch": 0.51, + "learning_rate": 2.883404270124063e-06, + "loss": 1.9945, + "step": 10700 + }, + { + "epoch": 0.51, + "learning_rate": 2.881242052096638e-06, + "loss": 1.8143, + "step": 10800 + }, + { + "epoch": 0.52, + "learning_rate": 2.879060795095863e-06, + "loss": 1.7915, + "step": 10900 + }, + { + "epoch": 0.52, + "learning_rate": 2.8768605291880767e-06, + "loss": 1.8868, + "step": 11000 + }, + { + "epoch": 0.52, + "learning_rate": 2.8746412847016387e-06, + "loss": 1.8033, + "step": 11100 + }, + { + "epoch": 0.53, + "learning_rate": 2.8724030922265068e-06, + "loss": 2.0053, + "step": 11200 + }, + { + "epoch": 0.53, + "learning_rate": 2.870145982613818e-06, + "loss": 1.867, + "step": 11300 + }, + { + "epoch": 0.54, + "learning_rate": 2.867869986975461e-06, + "loss": 1.8002, + "step": 11400 + }, + { + "epoch": 0.54, + "learning_rate": 2.865575136683649e-06, + "loss": 1.8835, + "step": 11500 + }, + { + "epoch": 0.55, + "learning_rate": 2.863261463370487e-06, + "loss": 1.7312, + "step": 11600 + }, + { + "epoch": 0.55, + "learning_rate": 2.8609289989275353e-06, + "loss": 1.8402, + "step": 11700 + }, + { + "epoch": 0.56, + "learning_rate": 2.858577775505371e-06, + "loss": 1.9007, + "step": 11800 + }, + { + "epoch": 0.56, + "learning_rate": 2.856207825513144e-06, + "loss": 1.8235, + "step": 11900 + }, + { + "epoch": 0.57, + "learning_rate": 2.853819181618129e-06, + "loss": 1.8568, + "step": 12000 + }, + { + "epoch": 0.57, + "learning_rate": 2.851411876745278e-06, + "loss": 1.9159, + "step": 12100 + }, + { + "epoch": 0.58, + "learning_rate": 2.848985944076763e-06, + "loss": 1.9857, + "step": 12200 + }, + { + "epoch": 0.58, + "learning_rate": 2.846541417051524e-06, + "loss": 1.8676, + "step": 12300 + }, + { + "epoch": 0.59, + "learning_rate": 2.8440783293648015e-06, + "loss": 1.8022, + "step": 12400 + }, + { + "epoch": 0.59, + "learning_rate": 2.8415967149676773e-06, + "loss": 1.8365, + "step": 12500 + }, + { + "epoch": 0.6, + "learning_rate": 2.8390966080666035e-06, + "loss": 1.8702, + "step": 12600 + }, + { + "epoch": 0.6, + "learning_rate": 2.8365780431229317e-06, + "loss": 1.8221, + "step": 12700 + }, + { + "epoch": 0.61, + "learning_rate": 2.8340410548524395e-06, + "loss": 1.8498, + "step": 12800 + }, + { + "epoch": 0.61, + "learning_rate": 2.8314856782248494e-06, + "loss": 1.8906, + "step": 12900 + }, + { + "epoch": 0.61, + "learning_rate": 2.8289119484633485e-06, + "loss": 2.0184, + "step": 13000 + }, + { + "epoch": 0.62, + "learning_rate": 2.8263199010441038e-06, + "loss": 1.8205, + "step": 13100 + }, + { + "epoch": 0.62, + "learning_rate": 2.82370957169577e-06, + "loss": 1.9686, + "step": 13200 + }, + { + "epoch": 0.63, + "learning_rate": 2.8210809963990004e-06, + "loss": 1.7651, + "step": 13300 + }, + { + "epoch": 0.63, + "learning_rate": 2.8184342113859494e-06, + "loss": 1.8216, + "step": 13400 + }, + { + "epoch": 0.64, + "learning_rate": 2.815769253139773e-06, + "loss": 1.8081, + "step": 13500 + }, + { + "epoch": 0.64, + "learning_rate": 2.813086158394126e-06, + "loss": 1.7233, + "step": 13600 + }, + { + "epoch": 0.65, + "learning_rate": 2.8103849641326563e-06, + "loss": 1.8446, + "step": 13700 + }, + { + "epoch": 0.65, + "learning_rate": 2.807665707588494e-06, + "loss": 1.8379, + "step": 13800 + }, + { + "epoch": 0.66, + "learning_rate": 2.8049284262437393e-06, + "loss": 1.8149, + "step": 13900 + }, + { + "epoch": 0.66, + "learning_rate": 2.802173157828946e-06, + "loss": 1.9463, + "step": 14000 + }, + { + "epoch": 0.67, + "learning_rate": 2.799399940322599e-06, + "loss": 1.8382, + "step": 14100 + }, + { + "epoch": 0.67, + "learning_rate": 2.7966088119505945e-06, + "loss": 1.8039, + "step": 14200 + }, + { + "epoch": 0.68, + "learning_rate": 2.79379981118571e-06, + "loss": 2.0244, + "step": 14300 + }, + { + "epoch": 0.68, + "learning_rate": 2.7909729767470757e-06, + "loss": 1.8587, + "step": 14400 + }, + { + "epoch": 0.69, + "learning_rate": 2.7881283475996405e-06, + "loss": 1.8551, + "step": 14500 + }, + { + "epoch": 0.69, + "learning_rate": 2.7852659629536335e-06, + "loss": 1.9153, + "step": 14600 + }, + { + "epoch": 0.69, + "learning_rate": 2.782385862264027e-06, + "loss": 1.7548, + "step": 14700 + }, + { + "epoch": 0.7, + "learning_rate": 2.779488085229987e-06, + "loss": 1.8052, + "step": 14800 + }, + { + "epoch": 0.7, + "learning_rate": 2.7765726717943334e-06, + "loss": 1.7594, + "step": 14900 + }, + { + "epoch": 0.71, + "learning_rate": 2.773639662142983e-06, + "loss": 1.8186, + "step": 15000 + }, + { + "epoch": 0.71, + "learning_rate": 2.770689096704397e-06, + "loss": 1.9036, + "step": 15100 + }, + { + "epoch": 0.72, + "learning_rate": 2.7677210161490276e-06, + "loss": 1.8217, + "step": 15200 + }, + { + "epoch": 0.72, + "learning_rate": 2.7647354613887523e-06, + "loss": 1.8397, + "step": 15300 + }, + { + "epoch": 0.73, + "learning_rate": 2.761732473576313e-06, + "loss": 1.7251, + "step": 15400 + }, + { + "epoch": 0.73, + "learning_rate": 2.7587120941047475e-06, + "loss": 1.8731, + "step": 15500 + }, + { + "epoch": 0.74, + "learning_rate": 2.7556743646068202e-06, + "loss": 1.805, + "step": 15600 + }, + { + "epoch": 0.74, + "learning_rate": 2.752619326954447e-06, + "loss": 1.8677, + "step": 15700 + }, + { + "epoch": 0.75, + "learning_rate": 2.749547023258118e-06, + "loss": 1.8121, + "step": 15800 + }, + { + "epoch": 0.75, + "eval_loss": 1.4021737575531006, + "eval_runtime": 163.1438, + "eval_samples_per_second": 5.694, + "eval_steps_per_second": 5.694, + "step": 15870 + }, + { + "epoch": 0.75, + "learning_rate": 2.7464574958663186e-06, + "loss": 1.8015, + "step": 15900 + }, + { + "epoch": 0.76, + "learning_rate": 2.743350787364944e-06, + "loss": 1.7014, + "step": 16000 + }, + { + "epoch": 0.76, + "learning_rate": 2.7402269405767133e-06, + "loss": 1.7616, + "step": 16100 + }, + { + "epoch": 0.77, + "learning_rate": 2.7370859985605794e-06, + "loss": 1.7529, + "step": 16200 + }, + { + "epoch": 0.77, + "learning_rate": 2.7339280046111336e-06, + "loss": 1.7992, + "step": 16300 + }, + { + "epoch": 0.78, + "learning_rate": 2.7307530022580115e-06, + "loss": 1.5267, + "step": 16400 + }, + { + "epoch": 0.78, + "learning_rate": 2.7275610352652913e-06, + "loss": 1.6973, + "step": 16500 + }, + { + "epoch": 0.78, + "learning_rate": 2.7243521476308908e-06, + "loss": 1.813, + "step": 16600 + }, + { + "epoch": 0.79, + "learning_rate": 2.721126383585962e-06, + "loss": 1.842, + "step": 16700 + }, + { + "epoch": 0.79, + "learning_rate": 2.7178837875942787e-06, + "loss": 1.9349, + "step": 16800 + }, + { + "epoch": 0.8, + "learning_rate": 2.7146244043516273e-06, + "loss": 1.7218, + "step": 16900 + }, + { + "epoch": 0.8, + "learning_rate": 2.7113482787851883e-06, + "loss": 1.8096, + "step": 17000 + }, + { + "epoch": 0.81, + "learning_rate": 2.7080554560529164e-06, + "loss": 1.7827, + "step": 17100 + }, + { + "epoch": 0.81, + "learning_rate": 2.7047459815429214e-06, + "loss": 1.7434, + "step": 17200 + }, + { + "epoch": 0.82, + "learning_rate": 2.7014199008728377e-06, + "loss": 1.8203, + "step": 17300 + }, + { + "epoch": 0.82, + "learning_rate": 2.698077259889201e-06, + "loss": 1.7201, + "step": 17400 + }, + { + "epoch": 0.83, + "learning_rate": 2.6947181046668113e-06, + "loss": 1.8474, + "step": 17500 + }, + { + "epoch": 0.83, + "learning_rate": 2.691342481508102e-06, + "loss": 1.7868, + "step": 17600 + }, + { + "epoch": 0.84, + "learning_rate": 2.6879504369424983e-06, + "loss": 1.7272, + "step": 17700 + }, + { + "epoch": 0.84, + "learning_rate": 2.6845420177257774e-06, + "loss": 1.8764, + "step": 17800 + }, + { + "epoch": 0.85, + "learning_rate": 2.6811172708394243e-06, + "loss": 1.5964, + "step": 17900 + }, + { + "epoch": 0.85, + "learning_rate": 2.6776762434899845e-06, + "loss": 1.7725, + "step": 18000 + }, + { + "epoch": 0.86, + "learning_rate": 2.6742189831084106e-06, + "loss": 1.8118, + "step": 18100 + }, + { + "epoch": 0.86, + "learning_rate": 2.6707455373494125e-06, + "loss": 1.6714, + "step": 18200 + }, + { + "epoch": 0.86, + "learning_rate": 2.667255954090798e-06, + "loss": 1.7673, + "step": 18300 + }, + { + "epoch": 0.87, + "learning_rate": 2.6637502814328124e-06, + "loss": 1.8517, + "step": 18400 + }, + { + "epoch": 0.87, + "learning_rate": 2.6602285676974786e-06, + "loss": 1.7459, + "step": 18500 + }, + { + "epoch": 0.88, + "learning_rate": 2.6566908614279262e-06, + "loss": 1.8677, + "step": 18600 + }, + { + "epoch": 0.88, + "learning_rate": 2.6531372113877273e-06, + "loss": 1.8378, + "step": 18700 + }, + { + "epoch": 0.89, + "learning_rate": 2.649567666560222e-06, + "loss": 1.7712, + "step": 18800 + }, + { + "epoch": 0.89, + "learning_rate": 2.645982276147842e-06, + "loss": 1.7846, + "step": 18900 + }, + { + "epoch": 0.9, + "learning_rate": 2.6423810895714345e-06, + "loss": 1.7452, + "step": 19000 + }, + { + "epoch": 0.9, + "learning_rate": 2.6387641564695807e-06, + "loss": 1.8064, + "step": 19100 + }, + { + "epoch": 0.91, + "learning_rate": 2.635131526697911e-06, + "loss": 1.6403, + "step": 19200 + }, + { + "epoch": 0.91, + "learning_rate": 2.631483250328417e-06, + "loss": 1.7232, + "step": 19300 + }, + { + "epoch": 0.92, + "learning_rate": 2.627819377648764e-06, + "loss": 1.8836, + "step": 19400 + }, + { + "epoch": 0.92, + "learning_rate": 2.6241399591615938e-06, + "loss": 1.8373, + "step": 19500 + }, + { + "epoch": 0.93, + "learning_rate": 2.620445045583833e-06, + "loss": 1.7807, + "step": 19600 + }, + { + "epoch": 0.93, + "learning_rate": 2.6167346878459907e-06, + "loss": 1.8299, + "step": 19700 + }, + { + "epoch": 0.94, + "learning_rate": 2.6130089370914575e-06, + "loss": 1.8572, + "step": 19800 + }, + { + "epoch": 0.94, + "learning_rate": 2.609267844675801e-06, + "loss": 1.651, + "step": 19900 + }, + { + "epoch": 0.95, + "learning_rate": 2.605511462166057e-06, + "loss": 1.8989, + "step": 20000 + }, + { + "epoch": 0.95, + "learning_rate": 2.6017398413400198e-06, + "loss": 1.8421, + "step": 20100 + }, + { + "epoch": 0.95, + "learning_rate": 2.597953034185528e-06, + "loss": 1.8114, + "step": 20200 + }, + { + "epoch": 0.96, + "learning_rate": 2.5941510928997473e-06, + "loss": 1.8759, + "step": 20300 + }, + { + "epoch": 0.96, + "learning_rate": 2.590334069888451e-06, + "loss": 1.8544, + "step": 20400 + }, + { + "epoch": 0.97, + "learning_rate": 2.5865020177652995e-06, + "loss": 1.717, + "step": 20500 + }, + { + "epoch": 0.97, + "learning_rate": 2.5826549893511133e-06, + "loss": 1.7786, + "step": 20600 + }, + { + "epoch": 0.98, + "learning_rate": 2.578793037673145e-06, + "loss": 1.6818, + "step": 20700 + }, + { + "epoch": 0.98, + "learning_rate": 2.574916215964348e-06, + "loss": 1.6679, + "step": 20800 + }, + { + "epoch": 0.99, + "learning_rate": 2.5710245776626463e-06, + "loss": 1.8773, + "step": 20900 + }, + { + "epoch": 0.99, + "learning_rate": 2.5671181764101916e-06, + "loss": 1.6672, + "step": 21000 + }, + { + "epoch": 1.0, + "learning_rate": 2.56319706605263e-06, + "loss": 1.8333, + "step": 21100 + }, + { + "epoch": 1.0, + "eval_loss": 1.367815613746643, + "eval_runtime": 161.7042, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 5.745, + "step": 21160 + }, + { + "epoch": 1.0, + "learning_rate": 2.5592613006383554e-06, + "loss": 1.873, + "step": 21200 + }, + { + "epoch": 1.01, + "learning_rate": 2.5553109344177676e-06, + "loss": 1.7398, + "step": 21300 + }, + { + "epoch": 1.01, + "learning_rate": 2.5513460218425225e-06, + "loss": 1.8562, + "step": 21400 + }, + { + "epoch": 1.02, + "learning_rate": 2.5473666175647824e-06, + "loss": 1.8687, + "step": 21500 + }, + { + "epoch": 1.02, + "learning_rate": 2.543372776436463e-06, + "loss": 1.8159, + "step": 21600 + }, + { + "epoch": 1.03, + "learning_rate": 2.539364553508476e-06, + "loss": 1.7736, + "step": 21700 + }, + { + "epoch": 1.03, + "learning_rate": 2.5353420040299714e-06, + "loss": 1.8746, + "step": 21800 + }, + { + "epoch": 1.04, + "learning_rate": 2.531305183447576e-06, + "loss": 1.7582, + "step": 21900 + }, + { + "epoch": 1.04, + "learning_rate": 2.527254147404629e-06, + "loss": 1.9113, + "step": 22000 + }, + { + "epoch": 1.04, + "learning_rate": 2.5231889517404136e-06, + "loss": 1.8019, + "step": 22100 + }, + { + "epoch": 1.05, + "learning_rate": 2.5191096524893894e-06, + "loss": 1.8494, + "step": 22200 + }, + { + "epoch": 1.05, + "learning_rate": 2.5150163058804203e-06, + "loss": 1.698, + "step": 22300 + }, + { + "epoch": 1.06, + "learning_rate": 2.5109089683359967e-06, + "loss": 1.7218, + "step": 22400 + }, + { + "epoch": 1.06, + "learning_rate": 2.5067876964714582e-06, + "loss": 1.7944, + "step": 22500 + }, + { + "epoch": 1.07, + "learning_rate": 2.502652547094218e-06, + "loss": 1.8057, + "step": 22600 + }, + { + "epoch": 1.07, + "learning_rate": 2.4985035772029737e-06, + "loss": 1.677, + "step": 22700 + }, + { + "epoch": 1.08, + "learning_rate": 2.4943408439869243e-06, + "loss": 1.8319, + "step": 22800 + }, + { + "epoch": 1.08, + "learning_rate": 2.490164404824983e-06, + "loss": 1.742, + "step": 22900 + }, + { + "epoch": 1.09, + "learning_rate": 2.485974317284983e-06, + "loss": 1.7521, + "step": 23000 + }, + { + "epoch": 1.09, + "learning_rate": 2.4817706391228884e-06, + "loss": 1.8927, + "step": 23100 + }, + { + "epoch": 1.1, + "learning_rate": 2.4775534282819945e-06, + "loss": 1.6825, + "step": 23200 + }, + { + "epoch": 1.1, + "learning_rate": 2.473322742892131e-06, + "loss": 1.7289, + "step": 23300 + }, + { + "epoch": 1.11, + "learning_rate": 2.4690786412688594e-06, + "loss": 1.8572, + "step": 23400 + }, + { + "epoch": 1.11, + "learning_rate": 2.4648211819126706e-06, + "loss": 1.7959, + "step": 23500 + }, + { + "epoch": 1.12, + "learning_rate": 2.460550423508178e-06, + "loss": 1.765, + "step": 23600 + }, + { + "epoch": 1.12, + "learning_rate": 2.4562664249233064e-06, + "loss": 1.7334, + "step": 23700 + }, + { + "epoch": 1.12, + "learning_rate": 2.451969245208486e-06, + "loss": 1.6651, + "step": 23800 + }, + { + "epoch": 1.13, + "learning_rate": 2.4476589435958323e-06, + "loss": 1.7472, + "step": 23900 + }, + { + "epoch": 1.13, + "learning_rate": 2.4433355794983336e-06, + "loss": 1.8278, + "step": 24000 + }, + { + "epoch": 1.14, + "learning_rate": 2.43899921250903e-06, + "loss": 1.6537, + "step": 24100 + }, + { + "epoch": 1.14, + "learning_rate": 2.4346499024001946e-06, + "loss": 1.6281, + "step": 24200 + }, + { + "epoch": 1.15, + "learning_rate": 2.430287709122506e-06, + "loss": 1.8405, + "step": 24300 + }, + { + "epoch": 1.15, + "learning_rate": 2.425912692804224e-06, + "loss": 1.7661, + "step": 24400 + }, + { + "epoch": 1.16, + "learning_rate": 2.4215249137503624e-06, + "loss": 1.7644, + "step": 24500 + }, + { + "epoch": 1.16, + "learning_rate": 2.417124432441853e-06, + "loss": 1.6826, + "step": 24600 + }, + { + "epoch": 1.17, + "learning_rate": 2.412711309534717e-06, + "loss": 1.7262, + "step": 24700 + }, + { + "epoch": 1.17, + "learning_rate": 2.4082856058592265e-06, + "loss": 1.8845, + "step": 24800 + }, + { + "epoch": 1.18, + "learning_rate": 2.4038473824190656e-06, + "loss": 1.922, + "step": 24900 + }, + { + "epoch": 1.18, + "learning_rate": 2.399396700390491e-06, + "loss": 1.703, + "step": 25000 + }, + { + "epoch": 1.19, + "learning_rate": 2.394933621121487e-06, + "loss": 1.83, + "step": 25100 + }, + { + "epoch": 1.19, + "learning_rate": 2.3904582061309217e-06, + "loss": 1.6753, + "step": 25200 + }, + { + "epoch": 1.2, + "learning_rate": 2.3859705171076983e-06, + "loss": 1.8203, + "step": 25300 + }, + { + "epoch": 1.2, + "learning_rate": 2.3814706159099038e-06, + "loss": 1.7362, + "step": 25400 + }, + { + "epoch": 1.21, + "learning_rate": 2.376958564563958e-06, + "loss": 1.8836, + "step": 25500 + }, + { + "epoch": 1.21, + "learning_rate": 2.372434425263757e-06, + "loss": 1.7072, + "step": 25600 + }, + { + "epoch": 1.21, + "learning_rate": 2.367898260369818e-06, + "loss": 1.6916, + "step": 25700 + }, + { + "epoch": 1.22, + "learning_rate": 2.3633501324084165e-06, + "loss": 1.6549, + "step": 25800 + }, + { + "epoch": 1.22, + "learning_rate": 2.358790104070728e-06, + "loss": 1.7526, + "step": 25900 + }, + { + "epoch": 1.23, + "learning_rate": 2.354218238211962e-06, + "loss": 1.7785, + "step": 26000 + }, + { + "epoch": 1.23, + "learning_rate": 2.349634597850495e-06, + "loss": 1.7332, + "step": 26100 + }, + { + "epoch": 1.24, + "learning_rate": 2.3450392461670026e-06, + "loss": 1.7434, + "step": 26200 + }, + { + "epoch": 1.24, + "learning_rate": 2.3404322465035903e-06, + "loss": 1.8742, + "step": 26300 + }, + { + "epoch": 1.25, + "learning_rate": 2.3358136623629167e-06, + "loss": 1.6601, + "step": 26400 + }, + { + "epoch": 1.25, + "eval_loss": 1.3507641553878784, + "eval_runtime": 162.7404, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 26450 + }, + { + "epoch": 1.25, + "learning_rate": 2.331183557407322e-06, + "loss": 1.7639, + "step": 26500 + }, + { + "epoch": 1.26, + "learning_rate": 2.3265419954579467e-06, + "loss": 1.849, + "step": 26600 + }, + { + "epoch": 1.26, + "learning_rate": 2.321889040493856e-06, + "loss": 1.9006, + "step": 26700 + }, + { + "epoch": 1.27, + "learning_rate": 2.317224756651156e-06, + "loss": 1.6524, + "step": 26800 + }, + { + "epoch": 1.27, + "learning_rate": 2.3125492082221074e-06, + "loss": 1.8237, + "step": 26900 + }, + { + "epoch": 1.28, + "learning_rate": 2.307862459654243e-06, + "loss": 1.7348, + "step": 27000 + }, + { + "epoch": 1.28, + "learning_rate": 2.303164575549478e-06, + "loss": 1.6887, + "step": 27100 + }, + { + "epoch": 1.29, + "learning_rate": 2.298455620663217e-06, + "loss": 1.7558, + "step": 27200 + }, + { + "epoch": 1.29, + "learning_rate": 2.293735659903468e-06, + "loss": 1.8181, + "step": 27300 + }, + { + "epoch": 1.3, + "learning_rate": 2.2890047583299385e-06, + "loss": 1.7344, + "step": 27400 + }, + { + "epoch": 1.3, + "learning_rate": 2.284262981153147e-06, + "loss": 1.8456, + "step": 27500 + }, + { + "epoch": 1.3, + "learning_rate": 2.27951039373352e-06, + "loss": 1.711, + "step": 27600 + }, + { + "epoch": 1.31, + "learning_rate": 2.2747470615804907e-06, + "loss": 1.7673, + "step": 27700 + }, + { + "epoch": 1.31, + "learning_rate": 2.269973050351599e-06, + "loss": 1.7957, + "step": 27800 + }, + { + "epoch": 1.32, + "learning_rate": 2.265188425851583e-06, + "loss": 1.6838, + "step": 27900 + }, + { + "epoch": 1.32, + "learning_rate": 2.260393254031475e-06, + "loss": 1.6342, + "step": 28000 + }, + { + "epoch": 1.33, + "learning_rate": 2.2555876009876904e-06, + "loss": 1.8296, + "step": 28100 + }, + { + "epoch": 1.33, + "learning_rate": 2.250771532961118e-06, + "loss": 1.7831, + "step": 28200 + }, + { + "epoch": 1.34, + "learning_rate": 2.2459451163362036e-06, + "loss": 1.7551, + "step": 28300 + }, + { + "epoch": 1.34, + "learning_rate": 2.241108417640041e-06, + "loss": 1.708, + "step": 28400 + }, + { + "epoch": 1.35, + "learning_rate": 2.2362615035414496e-06, + "loss": 1.7695, + "step": 28500 + }, + { + "epoch": 1.35, + "learning_rate": 2.231404440850058e-06, + "loss": 1.6231, + "step": 28600 + }, + { + "epoch": 1.36, + "learning_rate": 2.2265372965153827e-06, + "loss": 1.7269, + "step": 28700 + }, + { + "epoch": 1.36, + "learning_rate": 2.2216601376259044e-06, + "loss": 1.6641, + "step": 28800 + }, + { + "epoch": 1.37, + "learning_rate": 2.2167730314081447e-06, + "loss": 1.7724, + "step": 28900 + }, + { + "epoch": 1.37, + "learning_rate": 2.211876045225738e-06, + "loss": 1.909, + "step": 29000 + }, + { + "epoch": 1.38, + "learning_rate": 2.2069692465785034e-06, + "loss": 1.7163, + "step": 29100 + }, + { + "epoch": 1.38, + "learning_rate": 2.202052703101516e-06, + "loss": 1.857, + "step": 29200 + }, + { + "epoch": 1.38, + "learning_rate": 2.1971264825641716e-06, + "loss": 1.6806, + "step": 29300 + }, + { + "epoch": 1.39, + "learning_rate": 2.1921906528692556e-06, + "loss": 1.7828, + "step": 29400 + }, + { + "epoch": 1.39, + "learning_rate": 2.187245282052004e-06, + "loss": 1.7669, + "step": 29500 + }, + { + "epoch": 1.4, + "learning_rate": 2.1822904382791686e-06, + "loss": 1.7001, + "step": 29600 + }, + { + "epoch": 1.4, + "learning_rate": 2.1773261898480747e-06, + "loss": 1.6504, + "step": 29700 + }, + { + "epoch": 1.41, + "learning_rate": 2.172352605185682e-06, + "loss": 1.6888, + "step": 29800 + }, + { + "epoch": 1.41, + "learning_rate": 2.167369752847639e-06, + "loss": 1.6804, + "step": 29900 + }, + { + "epoch": 1.42, + "learning_rate": 2.162377701517341e-06, + "loss": 1.5615, + "step": 30000 + }, + { + "epoch": 1.42, + "learning_rate": 2.1573765200049817e-06, + "loss": 1.6089, + "step": 30100 + }, + { + "epoch": 1.43, + "learning_rate": 2.1523662772466025e-06, + "loss": 1.7575, + "step": 30200 + }, + { + "epoch": 1.43, + "learning_rate": 2.1473470423031475e-06, + "loss": 1.6443, + "step": 30300 + }, + { + "epoch": 1.44, + "learning_rate": 2.1423188843595067e-06, + "loss": 1.6201, + "step": 30400 + }, + { + "epoch": 1.44, + "learning_rate": 2.1372818727235653e-06, + "loss": 1.7594, + "step": 30500 + }, + { + "epoch": 1.45, + "learning_rate": 2.132236076825247e-06, + "loss": 1.6505, + "step": 30600 + }, + { + "epoch": 1.45, + "learning_rate": 2.127181566215557e-06, + "loss": 1.8139, + "step": 30700 + }, + { + "epoch": 1.46, + "learning_rate": 2.122118410565624e-06, + "loss": 1.738, + "step": 30800 + }, + { + "epoch": 1.46, + "learning_rate": 2.11704667966574e-06, + "loss": 1.693, + "step": 30900 + }, + { + "epoch": 1.47, + "learning_rate": 2.111966443424397e-06, + "loss": 1.8003, + "step": 31000 + }, + { + "epoch": 1.47, + "learning_rate": 2.1068777718673254e-06, + "loss": 1.8407, + "step": 31100 + }, + { + "epoch": 1.47, + "learning_rate": 2.101780735136526e-06, + "loss": 1.5816, + "step": 31200 + }, + { + "epoch": 1.48, + "learning_rate": 2.0966754034893047e-06, + "loss": 1.6609, + "step": 31300 + }, + { + "epoch": 1.48, + "learning_rate": 2.0915618472973062e-06, + "loss": 1.7292, + "step": 31400 + }, + { + "epoch": 1.49, + "learning_rate": 2.0864401370455406e-06, + "loss": 1.7347, + "step": 31500 + }, + { + "epoch": 1.49, + "learning_rate": 2.081310343331413e-06, + "loss": 1.748, + "step": 31600 + }, + { + "epoch": 1.5, + "learning_rate": 2.0761725368637496e-06, + "loss": 1.5452, + "step": 31700 + }, + { + "epoch": 1.5, + "eval_loss": 1.3357341289520264, + "eval_runtime": 162.3538, + "eval_samples_per_second": 5.722, + "eval_steps_per_second": 5.722, + "step": 31740 + }, + { + "epoch": 1.5, + "learning_rate": 2.0710267884618273e-06, + "loss": 1.6686, + "step": 31800 + }, + { + "epoch": 1.51, + "learning_rate": 2.0658731690543905e-06, + "loss": 1.72, + "step": 31900 + }, + { + "epoch": 1.51, + "learning_rate": 2.0607117496786794e-06, + "loss": 1.7252, + "step": 32000 + }, + { + "epoch": 1.52, + "learning_rate": 2.0555426014794477e-06, + "loss": 1.6562, + "step": 32100 + }, + { + "epoch": 1.52, + "learning_rate": 2.050365795707983e-06, + "loss": 1.6878, + "step": 32200 + }, + { + "epoch": 1.53, + "learning_rate": 2.0451814037211256e-06, + "loss": 1.7308, + "step": 32300 + }, + { + "epoch": 1.53, + "learning_rate": 2.0399894969802814e-06, + "loss": 1.6544, + "step": 32400 + }, + { + "epoch": 1.54, + "learning_rate": 2.034790147050442e-06, + "loss": 1.7115, + "step": 32500 + }, + { + "epoch": 1.54, + "learning_rate": 2.0295834255991927e-06, + "loss": 1.8076, + "step": 32600 + }, + { + "epoch": 1.55, + "learning_rate": 2.024369404395731e-06, + "loss": 1.6923, + "step": 32700 + }, + { + "epoch": 1.55, + "learning_rate": 2.01914815530987e-06, + "loss": 1.8198, + "step": 32800 + }, + { + "epoch": 1.56, + "learning_rate": 2.013919750311055e-06, + "loss": 1.5914, + "step": 32900 + }, + { + "epoch": 1.56, + "learning_rate": 2.008684261467365e-06, + "loss": 1.7334, + "step": 33000 + }, + { + "epoch": 1.56, + "learning_rate": 2.003441760944525e-06, + "loss": 1.6914, + "step": 33100 + }, + { + "epoch": 1.57, + "learning_rate": 1.998192321004908e-06, + "loss": 1.5967, + "step": 33200 + }, + { + "epoch": 1.57, + "learning_rate": 1.992936014006538e-06, + "loss": 1.6271, + "step": 33300 + }, + { + "epoch": 1.58, + "learning_rate": 1.9876729124020963e-06, + "loss": 1.5439, + "step": 33400 + }, + { + "epoch": 1.58, + "learning_rate": 1.982403088737918e-06, + "loss": 1.5242, + "step": 33500 + }, + { + "epoch": 1.59, + "learning_rate": 1.977126615652999e-06, + "loss": 1.7863, + "step": 33600 + }, + { + "epoch": 1.59, + "learning_rate": 1.9718435658779864e-06, + "loss": 1.7852, + "step": 33700 + }, + { + "epoch": 1.6, + "learning_rate": 1.9665540122341817e-06, + "loss": 1.7474, + "step": 33800 + }, + { + "epoch": 1.6, + "learning_rate": 1.9612580276325363e-06, + "loss": 1.818, + "step": 33900 + }, + { + "epoch": 1.61, + "learning_rate": 1.9559556850726433e-06, + "loss": 1.8187, + "step": 34000 + }, + { + "epoch": 1.61, + "learning_rate": 1.9506470576417362e-06, + "loss": 1.6308, + "step": 34100 + }, + { + "epoch": 1.62, + "learning_rate": 1.9453322185136772e-06, + "loss": 1.5877, + "step": 34200 + }, + { + "epoch": 1.62, + "learning_rate": 1.9400112409479507e-06, + "loss": 1.5775, + "step": 34300 + }, + { + "epoch": 1.63, + "learning_rate": 1.9346841982886527e-06, + "loss": 1.6369, + "step": 34400 + }, + { + "epoch": 1.63, + "learning_rate": 1.929351163963481e-06, + "loss": 1.7436, + "step": 34500 + }, + { + "epoch": 1.64, + "learning_rate": 1.924012211482721e-06, + "loss": 1.7817, + "step": 34600 + }, + { + "epoch": 1.64, + "learning_rate": 1.918667414438235e-06, + "loss": 1.7958, + "step": 34700 + }, + { + "epoch": 1.64, + "learning_rate": 1.9133168465024454e-06, + "loss": 1.6632, + "step": 34800 + }, + { + "epoch": 1.65, + "learning_rate": 1.907960581427321e-06, + "loss": 1.7518, + "step": 34900 + }, + { + "epoch": 1.65, + "learning_rate": 1.9025986930433594e-06, + "loss": 1.7184, + "step": 35000 + }, + { + "epoch": 1.66, + "learning_rate": 1.8972312552585695e-06, + "loss": 1.6154, + "step": 35100 + }, + { + "epoch": 1.66, + "learning_rate": 1.891858342057453e-06, + "loss": 1.7069, + "step": 35200 + }, + { + "epoch": 1.67, + "learning_rate": 1.8864800274999842e-06, + "loss": 1.6902, + "step": 35300 + }, + { + "epoch": 1.67, + "learning_rate": 1.8810963857205902e-06, + "loss": 1.6736, + "step": 35400 + }, + { + "epoch": 1.68, + "learning_rate": 1.8757074909271275e-06, + "loss": 1.7893, + "step": 35500 + }, + { + "epoch": 1.68, + "learning_rate": 1.8703134173998603e-06, + "loss": 1.7374, + "step": 35600 + }, + { + "epoch": 1.69, + "learning_rate": 1.864914239490436e-06, + "loss": 1.7173, + "step": 35700 + }, + { + "epoch": 1.69, + "learning_rate": 1.8595100316208608e-06, + "loss": 1.6844, + "step": 35800 + }, + { + "epoch": 1.7, + "learning_rate": 1.854100868282473e-06, + "loss": 1.6794, + "step": 35900 + }, + { + "epoch": 1.7, + "learning_rate": 1.8486868240349173e-06, + "loss": 1.65, + "step": 36000 + }, + { + "epoch": 1.71, + "learning_rate": 1.8432679735051177e-06, + "loss": 1.6641, + "step": 36100 + }, + { + "epoch": 1.71, + "learning_rate": 1.8378443913862453e-06, + "loss": 1.6942, + "step": 36200 + }, + { + "epoch": 1.72, + "learning_rate": 1.8324161524366935e-06, + "loss": 1.782, + "step": 36300 + }, + { + "epoch": 1.72, + "learning_rate": 1.8269833314790437e-06, + "loss": 1.5728, + "step": 36400 + }, + { + "epoch": 1.73, + "learning_rate": 1.8215460033990368e-06, + "loss": 1.6751, + "step": 36500 + }, + { + "epoch": 1.73, + "learning_rate": 1.8161042431445376e-06, + "loss": 1.5691, + "step": 36600 + }, + { + "epoch": 1.73, + "learning_rate": 1.8106581257245064e-06, + "loss": 1.7601, + "step": 36700 + }, + { + "epoch": 1.74, + "learning_rate": 1.8052077262079612e-06, + "loss": 1.6157, + "step": 36800 + }, + { + "epoch": 1.74, + "learning_rate": 1.799753119722943e-06, + "loss": 1.7615, + "step": 36900 + }, + { + "epoch": 1.75, + "learning_rate": 1.7942943814554837e-06, + "loss": 1.7381, + "step": 37000 + }, + { + "epoch": 1.75, + "eval_loss": 1.319101095199585, + "eval_runtime": 162.3139, + "eval_samples_per_second": 5.723, + "eval_steps_per_second": 5.723, + "step": 37030 + }, + { + "epoch": 1.75, + "learning_rate": 1.7888315866485659e-06, + "loss": 1.7177, + "step": 37100 + }, + { + "epoch": 1.76, + "learning_rate": 1.7833648106010884e-06, + "loss": 1.7527, + "step": 37200 + }, + { + "epoch": 1.76, + "learning_rate": 1.7778941286668257e-06, + "loss": 1.6938, + "step": 37300 + }, + { + "epoch": 1.77, + "learning_rate": 1.772419616253393e-06, + "loss": 1.7706, + "step": 37400 + }, + { + "epoch": 1.77, + "learning_rate": 1.7669413488212027e-06, + "loss": 1.6078, + "step": 37500 + }, + { + "epoch": 1.78, + "learning_rate": 1.761459401882427e-06, + "loss": 1.6867, + "step": 37600 + }, + { + "epoch": 1.78, + "learning_rate": 1.755973850999957e-06, + "loss": 1.6677, + "step": 37700 + }, + { + "epoch": 1.79, + "learning_rate": 1.750484771786358e-06, + "loss": 1.6582, + "step": 37800 + }, + { + "epoch": 1.79, + "learning_rate": 1.7449922399028333e-06, + "loss": 1.6047, + "step": 37900 + }, + { + "epoch": 1.8, + "learning_rate": 1.7394963310581735e-06, + "loss": 1.8746, + "step": 38000 + }, + { + "epoch": 1.8, + "learning_rate": 1.733997121007721e-06, + "loss": 1.549, + "step": 38100 + }, + { + "epoch": 1.81, + "learning_rate": 1.7284946855523186e-06, + "loss": 1.7323, + "step": 38200 + }, + { + "epoch": 1.81, + "learning_rate": 1.7229891005372704e-06, + "loss": 1.734, + "step": 38300 + }, + { + "epoch": 1.82, + "learning_rate": 1.7174804418512918e-06, + "loss": 1.6329, + "step": 38400 + }, + { + "epoch": 1.82, + "learning_rate": 1.7119687854254674e-06, + "loss": 1.5707, + "step": 38500 + }, + { + "epoch": 1.82, + "learning_rate": 1.7064542072322015e-06, + "loss": 1.7011, + "step": 38600 + }, + { + "epoch": 1.83, + "learning_rate": 1.7009367832841715e-06, + "loss": 1.6164, + "step": 38700 + }, + { + "epoch": 1.83, + "learning_rate": 1.6954165896332817e-06, + "loss": 1.6312, + "step": 38800 + }, + { + "epoch": 1.84, + "learning_rate": 1.6898937023696123e-06, + "loss": 1.7649, + "step": 38900 + }, + { + "epoch": 1.84, + "learning_rate": 1.6843681976203744e-06, + "loss": 1.6634, + "step": 39000 + }, + { + "epoch": 1.85, + "learning_rate": 1.6788401515488557e-06, + "loss": 1.6431, + "step": 39100 + }, + { + "epoch": 1.85, + "learning_rate": 1.673309640353376e-06, + "loss": 1.7147, + "step": 39200 + }, + { + "epoch": 1.86, + "learning_rate": 1.6677767402662318e-06, + "loss": 1.881, + "step": 39300 + }, + { + "epoch": 1.86, + "learning_rate": 1.6622415275526502e-06, + "loss": 1.6384, + "step": 39400 + }, + { + "epoch": 1.87, + "learning_rate": 1.6567040785097333e-06, + "loss": 1.6662, + "step": 39500 + }, + { + "epoch": 1.87, + "learning_rate": 1.6511644694654109e-06, + "loss": 1.6323, + "step": 39600 + }, + { + "epoch": 1.88, + "learning_rate": 1.6456227767773842e-06, + "loss": 1.7642, + "step": 39700 + }, + { + "epoch": 1.88, + "learning_rate": 1.6400790768320761e-06, + "loss": 1.6971, + "step": 39800 + }, + { + "epoch": 1.89, + "learning_rate": 1.6345334460435775e-06, + "loss": 1.7224, + "step": 39900 + }, + { + "epoch": 1.89, + "learning_rate": 1.6289859608525936e-06, + "loss": 1.7847, + "step": 40000 + }, + { + "epoch": 1.9, + "learning_rate": 1.623436697725391e-06, + "loss": 1.6998, + "step": 40100 + }, + { + "epoch": 1.9, + "learning_rate": 1.6178857331527427e-06, + "loss": 1.7637, + "step": 40200 + }, + { + "epoch": 1.9, + "learning_rate": 1.6123331436488752e-06, + "loss": 1.738, + "step": 40300 + }, + { + "epoch": 1.91, + "learning_rate": 1.6067790057504125e-06, + "loss": 1.8809, + "step": 40400 + }, + { + "epoch": 1.91, + "learning_rate": 1.6012233960153213e-06, + "loss": 1.6865, + "step": 40500 + }, + { + "epoch": 1.92, + "learning_rate": 1.5956663910218566e-06, + "loss": 1.7502, + "step": 40600 + }, + { + "epoch": 1.92, + "learning_rate": 1.590108067367505e-06, + "loss": 1.7131, + "step": 40700 + }, + { + "epoch": 1.93, + "learning_rate": 1.58454850166793e-06, + "loss": 1.6668, + "step": 40800 + }, + { + "epoch": 1.93, + "learning_rate": 1.5789877705559149e-06, + "loss": 1.6616, + "step": 40900 + }, + { + "epoch": 1.94, + "learning_rate": 1.573425950680308e-06, + "loss": 1.8484, + "step": 41000 + }, + { + "epoch": 1.94, + "learning_rate": 1.567863118704963e-06, + "loss": 1.722, + "step": 41100 + }, + { + "epoch": 1.95, + "learning_rate": 1.562299351307686e-06, + "loss": 1.6145, + "step": 41200 + }, + { + "epoch": 1.95, + "learning_rate": 1.5567347251791773e-06, + "loss": 1.744, + "step": 41300 + }, + { + "epoch": 1.96, + "learning_rate": 1.5511693170219723e-06, + "loss": 1.7476, + "step": 41400 + }, + { + "epoch": 1.96, + "learning_rate": 1.5456032035493878e-06, + "loss": 1.6705, + "step": 41500 + }, + { + "epoch": 1.97, + "learning_rate": 1.5400364614844604e-06, + "loss": 1.5381, + "step": 41600 + }, + { + "epoch": 1.97, + "learning_rate": 1.5344691675588926e-06, + "loss": 1.7072, + "step": 41700 + }, + { + "epoch": 1.98, + "learning_rate": 1.5289013985119934e-06, + "loss": 1.7217, + "step": 41800 + }, + { + "epoch": 1.98, + "learning_rate": 1.5233332310896214e-06, + "loss": 1.6447, + "step": 41900 + }, + { + "epoch": 1.99, + "learning_rate": 1.5177647420431253e-06, + "loss": 1.6961, + "step": 42000 + }, + { + "epoch": 1.99, + "learning_rate": 1.5121960081282878e-06, + "loss": 1.8037, + "step": 42100 + }, + { + "epoch": 1.99, + "learning_rate": 1.5066271061042672e-06, + "loss": 1.6076, + "step": 42200 + }, + { + "epoch": 2.0, + "learning_rate": 1.5010581127325374e-06, + "loss": 1.6256, + "step": 42300 + }, + { + "epoch": 2.0, + "eval_loss": 1.309001088142395, + "eval_runtime": 163.9053, + "eval_samples_per_second": 5.668, + "eval_steps_per_second": 5.668, + "step": 42320 + }, + { + "epoch": 2.0, + "learning_rate": 1.4954891047758328e-06, + "loss": 1.6049, + "step": 42400 + }, + { + "epoch": 2.01, + "learning_rate": 1.489920158997089e-06, + "loss": 1.5866, + "step": 42500 + }, + { + "epoch": 2.01, + "learning_rate": 1.4843513521583844e-06, + "loss": 1.6174, + "step": 42600 + }, + { + "epoch": 2.02, + "learning_rate": 1.4787827610198813e-06, + "loss": 1.711, + "step": 42700 + }, + { + "epoch": 2.02, + "learning_rate": 1.4732144623387696e-06, + "loss": 1.6283, + "step": 42800 + }, + { + "epoch": 2.03, + "learning_rate": 1.4676465328682085e-06, + "loss": 1.7035, + "step": 42900 + }, + { + "epoch": 2.03, + "learning_rate": 1.4620790493562662e-06, + "loss": 1.6869, + "step": 43000 + }, + { + "epoch": 2.04, + "learning_rate": 1.4565120885448656e-06, + "loss": 1.6827, + "step": 43100 + }, + { + "epoch": 2.04, + "learning_rate": 1.4509457271687238e-06, + "loss": 1.7237, + "step": 43200 + }, + { + "epoch": 2.05, + "learning_rate": 1.4453800419542962e-06, + "loss": 1.6418, + "step": 43300 + }, + { + "epoch": 2.05, + "learning_rate": 1.4398151096187167e-06, + "loss": 1.7514, + "step": 43400 + }, + { + "epoch": 2.06, + "learning_rate": 1.434251006868743e-06, + "loss": 1.7102, + "step": 43500 + }, + { + "epoch": 2.06, + "learning_rate": 1.4286878103996967e-06, + "loss": 1.6147, + "step": 43600 + }, + { + "epoch": 2.07, + "learning_rate": 1.4231255968944078e-06, + "loss": 1.557, + "step": 43700 + }, + { + "epoch": 2.07, + "learning_rate": 1.4175644430221568e-06, + "loss": 1.6971, + "step": 43800 + }, + { + "epoch": 2.07, + "learning_rate": 1.412004425437619e-06, + "loss": 1.6645, + "step": 43900 + }, + { + "epoch": 2.08, + "learning_rate": 1.4064456207798066e-06, + "loss": 1.688, + "step": 44000 + }, + { + "epoch": 2.08, + "learning_rate": 1.4008881056710125e-06, + "loss": 1.7062, + "step": 44100 + }, + { + "epoch": 2.09, + "learning_rate": 1.3953319567157556e-06, + "loss": 1.5745, + "step": 44200 + }, + { + "epoch": 2.09, + "learning_rate": 1.3897772504997228e-06, + "loss": 1.5922, + "step": 44300 + }, + { + "epoch": 2.1, + "learning_rate": 1.3842240635887154e-06, + "loss": 1.7366, + "step": 44400 + }, + { + "epoch": 2.1, + "learning_rate": 1.3786724725275911e-06, + "loss": 1.7974, + "step": 44500 + }, + { + "epoch": 2.11, + "learning_rate": 1.3731225538392125e-06, + "loss": 1.7394, + "step": 44600 + }, + { + "epoch": 2.11, + "learning_rate": 1.367574384023388e-06, + "loss": 1.7766, + "step": 44700 + }, + { + "epoch": 2.12, + "learning_rate": 1.3620280395558218e-06, + "loss": 1.631, + "step": 44800 + }, + { + "epoch": 2.12, + "learning_rate": 1.3564835968870557e-06, + "loss": 1.6251, + "step": 44900 + }, + { + "epoch": 2.13, + "learning_rate": 1.3509411324414191e-06, + "loss": 1.6983, + "step": 45000 + }, + { + "epoch": 2.13, + "learning_rate": 1.345400722615972e-06, + "loss": 1.6382, + "step": 45100 + }, + { + "epoch": 2.14, + "learning_rate": 1.3398624437794549e-06, + "loss": 1.6588, + "step": 45200 + }, + { + "epoch": 2.14, + "learning_rate": 1.3343263722712342e-06, + "loss": 1.8123, + "step": 45300 + }, + { + "epoch": 2.15, + "learning_rate": 1.3287925844002496e-06, + "loss": 1.6796, + "step": 45400 + }, + { + "epoch": 2.15, + "learning_rate": 1.3232611564439656e-06, + "loss": 1.5431, + "step": 45500 + }, + { + "epoch": 2.16, + "learning_rate": 1.3177321646473154e-06, + "loss": 1.57, + "step": 45600 + }, + { + "epoch": 2.16, + "learning_rate": 1.3122056852216538e-06, + "loss": 1.6356, + "step": 45700 + }, + { + "epoch": 2.16, + "learning_rate": 1.3066817943437054e-06, + "loss": 1.6333, + "step": 45800 + }, + { + "epoch": 2.17, + "learning_rate": 1.3011605681545126e-06, + "loss": 1.595, + "step": 45900 + }, + { + "epoch": 2.17, + "learning_rate": 1.29564208275839e-06, + "loss": 1.5615, + "step": 46000 + }, + { + "epoch": 2.18, + "learning_rate": 1.2901264142218712e-06, + "loss": 1.7929, + "step": 46100 + }, + { + "epoch": 2.18, + "learning_rate": 1.2846136385726644e-06, + "loss": 1.8091, + "step": 46200 + }, + { + "epoch": 2.19, + "learning_rate": 1.2791038317986009e-06, + "loss": 1.6715, + "step": 46300 + }, + { + "epoch": 2.19, + "learning_rate": 1.2735970698465896e-06, + "loss": 1.6615, + "step": 46400 + }, + { + "epoch": 2.2, + "learning_rate": 1.2680934286215696e-06, + "loss": 1.6615, + "step": 46500 + }, + { + "epoch": 2.2, + "learning_rate": 1.2625929839854644e-06, + "loss": 1.7039, + "step": 46600 + }, + { + "epoch": 2.21, + "learning_rate": 1.2570958117561357e-06, + "loss": 1.7209, + "step": 46700 + }, + { + "epoch": 2.21, + "learning_rate": 1.2516019877063388e-06, + "loss": 1.7251, + "step": 46800 + }, + { + "epoch": 2.22, + "learning_rate": 1.2461115875626768e-06, + "loss": 1.7202, + "step": 46900 + }, + { + "epoch": 2.22, + "learning_rate": 1.2406246870045588e-06, + "loss": 1.7948, + "step": 47000 + }, + { + "epoch": 2.23, + "learning_rate": 1.2351413616631561e-06, + "loss": 1.6631, + "step": 47100 + }, + { + "epoch": 2.23, + "learning_rate": 1.2296616871203584e-06, + "loss": 1.6321, + "step": 47200 + }, + { + "epoch": 2.24, + "learning_rate": 1.2241857389077332e-06, + "loss": 1.7737, + "step": 47300 + }, + { + "epoch": 2.24, + "learning_rate": 1.2187135925054852e-06, + "loss": 1.5694, + "step": 47400 + }, + { + "epoch": 2.25, + "learning_rate": 1.2132453233414145e-06, + "loss": 1.7562, + "step": 47500 + }, + { + "epoch": 2.25, + "learning_rate": 1.207781006789877e-06, + "loss": 1.5521, + "step": 47600 + }, + { + "epoch": 2.25, + "eval_loss": 1.2960591316223145, + "eval_runtime": 158.903, + "eval_samples_per_second": 5.846, + "eval_steps_per_second": 5.846, + "step": 47610 + }, + { + "epoch": 2.25, + "learning_rate": 1.202320718170748e-06, + "loss": 1.6698, + "step": 47700 + }, + { + "epoch": 2.26, + "learning_rate": 1.1968645327483792e-06, + "loss": 1.5465, + "step": 47800 + }, + { + "epoch": 2.26, + "learning_rate": 1.1914125257305654e-06, + "loss": 1.6406, + "step": 47900 + }, + { + "epoch": 2.27, + "learning_rate": 1.1859647722675075e-06, + "loss": 1.6434, + "step": 48000 + }, + { + "epoch": 2.27, + "learning_rate": 1.1805213474507738e-06, + "loss": 1.5834, + "step": 48100 + }, + { + "epoch": 2.28, + "learning_rate": 1.1750823263122683e-06, + "loss": 1.683, + "step": 48200 + }, + { + "epoch": 2.28, + "learning_rate": 1.169647783823193e-06, + "loss": 1.5975, + "step": 48300 + }, + { + "epoch": 2.29, + "learning_rate": 1.1642177948930188e-06, + "loss": 1.6729, + "step": 48400 + }, + { + "epoch": 2.29, + "learning_rate": 1.1587924343684486e-06, + "loss": 1.688, + "step": 48500 + }, + { + "epoch": 2.3, + "learning_rate": 1.1533717770323887e-06, + "loss": 1.6362, + "step": 48600 + }, + { + "epoch": 2.3, + "learning_rate": 1.1479558976029164e-06, + "loss": 1.7004, + "step": 48700 + }, + { + "epoch": 2.31, + "learning_rate": 1.1425448707322505e-06, + "loss": 1.6087, + "step": 48800 + }, + { + "epoch": 2.31, + "learning_rate": 1.137138771005723e-06, + "loss": 1.6815, + "step": 48900 + }, + { + "epoch": 2.32, + "learning_rate": 1.1317376729407493e-06, + "loss": 1.5914, + "step": 49000 + }, + { + "epoch": 2.32, + "learning_rate": 1.1263416509858032e-06, + "loss": 1.5619, + "step": 49100 + }, + { + "epoch": 2.33, + "learning_rate": 1.1209507795193888e-06, + "loss": 1.6197, + "step": 49200 + }, + { + "epoch": 2.33, + "learning_rate": 1.1155651328490174e-06, + "loss": 1.6824, + "step": 49300 + }, + { + "epoch": 2.33, + "learning_rate": 1.11018478521018e-06, + "loss": 1.7277, + "step": 49400 + }, + { + "epoch": 2.34, + "learning_rate": 1.1048098107653282e-06, + "loss": 1.6273, + "step": 49500 + }, + { + "epoch": 2.34, + "learning_rate": 1.0994402836028472e-06, + "loss": 1.6803, + "step": 49600 + }, + { + "epoch": 2.35, + "learning_rate": 1.0940762777360401e-06, + "loss": 1.5929, + "step": 49700 + }, + { + "epoch": 2.35, + "learning_rate": 1.0887178671021024e-06, + "loss": 1.6484, + "step": 49800 + }, + { + "epoch": 2.36, + "learning_rate": 1.0833651255611058e-06, + "loss": 1.7423, + "step": 49900 + }, + { + "epoch": 2.36, + "learning_rate": 1.0780181268949805e-06, + "loss": 1.6847, + "step": 50000 + }, + { + "epoch": 2.37, + "learning_rate": 1.0726769448064956e-06, + "loss": 1.6074, + "step": 50100 + }, + { + "epoch": 2.37, + "learning_rate": 1.0673416529182462e-06, + "loss": 1.7478, + "step": 50200 + }, + { + "epoch": 2.38, + "learning_rate": 1.0620123247716362e-06, + "loss": 1.7042, + "step": 50300 + }, + { + "epoch": 2.38, + "learning_rate": 1.0566890338258655e-06, + "loss": 1.6337, + "step": 50400 + }, + { + "epoch": 2.39, + "learning_rate": 1.0513718534569187e-06, + "loss": 1.7174, + "step": 50500 + }, + { + "epoch": 2.39, + "learning_rate": 1.0460608569565506e-06, + "loss": 1.6805, + "step": 50600 + }, + { + "epoch": 2.4, + "learning_rate": 1.0407561175312802e-06, + "loss": 1.5872, + "step": 50700 + }, + { + "epoch": 2.4, + "learning_rate": 1.035457708301377e-06, + "loss": 1.7103, + "step": 50800 + }, + { + "epoch": 2.41, + "learning_rate": 1.0301657022998575e-06, + "loss": 1.7544, + "step": 50900 + }, + { + "epoch": 2.41, + "learning_rate": 1.0248801724714746e-06, + "loss": 1.6165, + "step": 51000 + }, + { + "epoch": 2.42, + "learning_rate": 1.019601191671715e-06, + "loss": 1.5813, + "step": 51100 + }, + { + "epoch": 2.42, + "learning_rate": 1.0143288326657935e-06, + "loss": 1.6332, + "step": 51200 + }, + { + "epoch": 2.42, + "learning_rate": 1.0090631681276508e-06, + "loss": 1.7332, + "step": 51300 + }, + { + "epoch": 2.43, + "learning_rate": 1.0038042706389505e-06, + "loss": 1.5387, + "step": 51400 + }, + { + "epoch": 2.43, + "learning_rate": 9.985522126880806e-07, + "loss": 1.5534, + "step": 51500 + }, + { + "epoch": 2.44, + "learning_rate": 9.93307066669153e-07, + "loss": 1.6457, + "step": 51600 + }, + { + "epoch": 2.44, + "learning_rate": 9.880689048810049e-07, + "loss": 1.6818, + "step": 51700 + }, + { + "epoch": 2.45, + "learning_rate": 9.828377995262048e-07, + "loss": 1.5609, + "step": 51800 + }, + { + "epoch": 2.45, + "learning_rate": 9.77613822710054e-07, + "loss": 1.7747, + "step": 51900 + }, + { + "epoch": 2.46, + "learning_rate": 9.72397046439596e-07, + "loss": 1.7221, + "step": 52000 + }, + { + "epoch": 2.46, + "learning_rate": 9.671875426226204e-07, + "loss": 1.7983, + "step": 52100 + }, + { + "epoch": 2.47, + "learning_rate": 9.61985383066676e-07, + "loss": 1.6314, + "step": 52200 + }, + { + "epoch": 2.47, + "learning_rate": 9.567906394780763e-07, + "loss": 1.6959, + "step": 52300 + }, + { + "epoch": 2.48, + "learning_rate": 9.516033834609155e-07, + "loss": 1.6105, + "step": 52400 + }, + { + "epoch": 2.48, + "learning_rate": 9.464236865160779e-07, + "loss": 1.573, + "step": 52500 + }, + { + "epoch": 2.49, + "learning_rate": 9.412516200402556e-07, + "loss": 1.6789, + "step": 52600 + }, + { + "epoch": 2.49, + "learning_rate": 9.360872553249605e-07, + "loss": 1.7057, + "step": 52700 + }, + { + "epoch": 2.5, + "learning_rate": 9.30930663555545e-07, + "loss": 1.6102, + "step": 52800 + }, + { + "epoch": 2.5, + "learning_rate": 9.257819158102203e-07, + "loss": 1.8318, + "step": 52900 + }, + { + "epoch": 2.5, + "eval_loss": 1.2909756898880005, + "eval_runtime": 158.004, + "eval_samples_per_second": 5.88, + "eval_steps_per_second": 5.88, + "step": 52900 + }, + { + "epoch": 2.51, + "learning_rate": 9.206410830590746e-07, + "loss": 1.6514, + "step": 53000 + }, + { + "epoch": 2.51, + "learning_rate": 9.15508236163097e-07, + "loss": 1.7379, + "step": 53100 + }, + { + "epoch": 2.51, + "learning_rate": 9.103834458732002e-07, + "loss": 1.6323, + "step": 53200 + }, + { + "epoch": 2.52, + "learning_rate": 9.052667828292439e-07, + "loss": 1.8245, + "step": 53300 + }, + { + "epoch": 2.52, + "learning_rate": 9.001583175590636e-07, + "loss": 1.5375, + "step": 53400 + }, + { + "epoch": 2.53, + "learning_rate": 8.950581204774961e-07, + "loss": 1.737, + "step": 53500 + }, + { + "epoch": 2.53, + "learning_rate": 8.899662618854105e-07, + "loss": 1.6755, + "step": 53600 + }, + { + "epoch": 2.54, + "learning_rate": 8.848828119687375e-07, + "loss": 1.6737, + "step": 53700 + }, + { + "epoch": 2.54, + "learning_rate": 8.798078407975051e-07, + "loss": 1.7876, + "step": 53800 + }, + { + "epoch": 2.55, + "learning_rate": 8.747414183248682e-07, + "loss": 1.6804, + "step": 53900 + }, + { + "epoch": 2.55, + "learning_rate": 8.696836143861491e-07, + "loss": 1.5951, + "step": 54000 + }, + { + "epoch": 2.56, + "learning_rate": 8.646344986978708e-07, + "loss": 1.6206, + "step": 54100 + }, + { + "epoch": 2.56, + "learning_rate": 8.595941408567983e-07, + "loss": 1.7823, + "step": 54200 + }, + { + "epoch": 2.57, + "learning_rate": 8.545626103389805e-07, + "loss": 1.6832, + "step": 54300 + }, + { + "epoch": 2.57, + "learning_rate": 8.495399764987894e-07, + "loss": 1.6455, + "step": 54400 + }, + { + "epoch": 2.58, + "learning_rate": 8.445263085679645e-07, + "loss": 1.6894, + "step": 54500 + }, + { + "epoch": 2.58, + "learning_rate": 8.395216756546627e-07, + "loss": 1.5944, + "step": 54600 + }, + { + "epoch": 2.59, + "learning_rate": 8.345261467425003e-07, + "loss": 1.7441, + "step": 54700 + }, + { + "epoch": 2.59, + "learning_rate": 8.295397906896052e-07, + "loss": 1.7046, + "step": 54800 + }, + { + "epoch": 2.59, + "learning_rate": 8.245626762276663e-07, + "loss": 1.6335, + "step": 54900 + }, + { + "epoch": 2.6, + "learning_rate": 8.195948719609889e-07, + "loss": 1.7515, + "step": 55000 + }, + { + "epoch": 2.6, + "learning_rate": 8.146364463655458e-07, + "loss": 1.6208, + "step": 55100 + }, + { + "epoch": 2.61, + "learning_rate": 8.096874677880322e-07, + "loss": 1.6655, + "step": 55200 + }, + { + "epoch": 2.61, + "learning_rate": 8.047480044449309e-07, + "loss": 1.7218, + "step": 55300 + }, + { + "epoch": 2.62, + "learning_rate": 7.998181244215638e-07, + "loss": 1.5814, + "step": 55400 + }, + { + "epoch": 2.62, + "learning_rate": 7.948978956711576e-07, + "loss": 1.7588, + "step": 55500 + }, + { + "epoch": 2.63, + "learning_rate": 7.899873860139058e-07, + "loss": 1.6841, + "step": 55600 + }, + { + "epoch": 2.63, + "learning_rate": 7.850866631360363e-07, + "loss": 1.6321, + "step": 55700 + }, + { + "epoch": 2.64, + "learning_rate": 7.801957945888744e-07, + "loss": 1.654, + "step": 55800 + }, + { + "epoch": 2.64, + "learning_rate": 7.75314847787914e-07, + "loss": 1.6165, + "step": 55900 + }, + { + "epoch": 2.65, + "learning_rate": 7.704438900118902e-07, + "loss": 1.7136, + "step": 56000 + }, + { + "epoch": 2.65, + "learning_rate": 7.655829884018475e-07, + "loss": 1.6892, + "step": 56100 + }, + { + "epoch": 2.66, + "learning_rate": 7.607322099602175e-07, + "loss": 1.6254, + "step": 56200 + }, + { + "epoch": 2.66, + "learning_rate": 7.558916215498944e-07, + "loss": 1.5811, + "step": 56300 + }, + { + "epoch": 2.67, + "learning_rate": 7.510612898933145e-07, + "loss": 1.6081, + "step": 56400 + }, + { + "epoch": 2.67, + "learning_rate": 7.462412815715343e-07, + "loss": 1.5603, + "step": 56500 + }, + { + "epoch": 2.68, + "learning_rate": 7.414316630233144e-07, + "loss": 1.7405, + "step": 56600 + }, + { + "epoch": 2.68, + "learning_rate": 7.366325005442026e-07, + "loss": 1.6653, + "step": 56700 + }, + { + "epoch": 2.68, + "learning_rate": 7.318438602856225e-07, + "loss": 1.6596, + "step": 56800 + }, + { + "epoch": 2.69, + "learning_rate": 7.270658082539581e-07, + "loss": 1.706, + "step": 56900 + }, + { + "epoch": 2.69, + "learning_rate": 7.222984103096469e-07, + "loss": 1.718, + "step": 57000 + }, + { + "epoch": 2.7, + "learning_rate": 7.175417321662698e-07, + "loss": 1.6861, + "step": 57100 + }, + { + "epoch": 2.7, + "learning_rate": 7.127958393896484e-07, + "loss": 1.668, + "step": 57200 + }, + { + "epoch": 2.71, + "learning_rate": 7.080607973969376e-07, + "loss": 1.7527, + "step": 57300 + }, + { + "epoch": 2.71, + "learning_rate": 7.033366714557257e-07, + "loss": 1.7254, + "step": 57400 + }, + { + "epoch": 2.72, + "learning_rate": 6.986235266831368e-07, + "loss": 1.5732, + "step": 57500 + }, + { + "epoch": 2.72, + "learning_rate": 6.93921428044928e-07, + "loss": 1.6163, + "step": 57600 + }, + { + "epoch": 2.73, + "learning_rate": 6.892304403545984e-07, + "loss": 1.7492, + "step": 57700 + }, + { + "epoch": 2.73, + "learning_rate": 6.845506282724956e-07, + "loss": 1.7095, + "step": 57800 + }, + { + "epoch": 2.74, + "learning_rate": 6.798820563049212e-07, + "loss": 1.7914, + "step": 57900 + }, + { + "epoch": 2.74, + "learning_rate": 6.75224788803245e-07, + "loss": 1.6378, + "step": 58000 + }, + { + "epoch": 2.75, + "learning_rate": 6.70578889963015e-07, + "loss": 1.6761, + "step": 58100 + }, + { + "epoch": 2.75, + "eval_loss": 1.2901337146759033, + "eval_runtime": 158.1238, + "eval_samples_per_second": 5.875, + "eval_steps_per_second": 5.875, + "step": 58190 + }, + { + "epoch": 2.75, + "learning_rate": 6.659444238230763e-07, + "loss": 1.6017, + "step": 58200 + }, + { + "epoch": 2.76, + "learning_rate": 6.613214542646845e-07, + "loss": 1.5221, + "step": 58300 + }, + { + "epoch": 2.76, + "learning_rate": 6.567100450106276e-07, + "loss": 1.7276, + "step": 58400 + }, + { + "epoch": 2.77, + "learning_rate": 6.521102596243459e-07, + "loss": 1.5169, + "step": 58500 + }, + { + "epoch": 2.77, + "learning_rate": 6.475221615090591e-07, + "loss": 1.7469, + "step": 58600 + }, + { + "epoch": 2.77, + "learning_rate": 6.429458139068882e-07, + "loss": 1.646, + "step": 58700 + }, + { + "epoch": 2.78, + "learning_rate": 6.383812798979856e-07, + "loss": 1.6483, + "step": 58800 + }, + { + "epoch": 2.78, + "learning_rate": 6.338286223996673e-07, + "loss": 1.5527, + "step": 58900 + }, + { + "epoch": 2.79, + "learning_rate": 6.29287904165543e-07, + "loss": 1.6215, + "step": 59000 + }, + { + "epoch": 2.79, + "learning_rate": 6.247591877846517e-07, + "loss": 1.6239, + "step": 59100 + }, + { + "epoch": 2.8, + "learning_rate": 6.202425356805997e-07, + "loss": 1.6994, + "step": 59200 + }, + { + "epoch": 2.8, + "learning_rate": 6.157380101107016e-07, + "loss": 1.5472, + "step": 59300 + }, + { + "epoch": 2.81, + "learning_rate": 6.112456731651181e-07, + "loss": 1.589, + "step": 59400 + }, + { + "epoch": 2.81, + "learning_rate": 6.067655867660037e-07, + "loss": 1.6836, + "step": 59500 + }, + { + "epoch": 2.82, + "learning_rate": 6.022978126666509e-07, + "loss": 1.6906, + "step": 59600 + }, + { + "epoch": 2.82, + "learning_rate": 5.978424124506421e-07, + "loss": 1.7639, + "step": 59700 + }, + { + "epoch": 2.83, + "learning_rate": 5.933994475309969e-07, + "loss": 1.6307, + "step": 59800 + }, + { + "epoch": 2.83, + "learning_rate": 5.889689791493279e-07, + "loss": 1.6508, + "step": 59900 + }, + { + "epoch": 2.84, + "learning_rate": 5.84551068374996e-07, + "loss": 1.6107, + "step": 60000 + }, + { + "epoch": 2.84, + "learning_rate": 5.801457761042689e-07, + "loss": 1.6451, + "step": 60100 + }, + { + "epoch": 2.85, + "learning_rate": 5.757531630594812e-07, + "loss": 1.6345, + "step": 60200 + }, + { + "epoch": 2.85, + "learning_rate": 5.71373289788197e-07, + "loss": 1.5496, + "step": 60300 + }, + { + "epoch": 2.85, + "learning_rate": 5.670062166623781e-07, + "loss": 1.6161, + "step": 60400 + }, + { + "epoch": 2.86, + "learning_rate": 5.626520038775476e-07, + "loss": 1.618, + "step": 60500 + }, + { + "epoch": 2.86, + "learning_rate": 5.583107114519624e-07, + "loss": 1.5446, + "step": 60600 + }, + { + "epoch": 2.87, + "learning_rate": 5.539823992257877e-07, + "loss": 1.6561, + "step": 60700 + }, + { + "epoch": 2.87, + "learning_rate": 5.496671268602682e-07, + "loss": 1.7354, + "step": 60800 + }, + { + "epoch": 2.88, + "learning_rate": 5.453649538369088e-07, + "loss": 1.5153, + "step": 60900 + }, + { + "epoch": 2.88, + "learning_rate": 5.410759394566529e-07, + "loss": 1.6056, + "step": 61000 + }, + { + "epoch": 2.89, + "learning_rate": 5.368001428390672e-07, + "loss": 1.674, + "step": 61100 + }, + { + "epoch": 2.89, + "learning_rate": 5.325376229215244e-07, + "loss": 1.6993, + "step": 61200 + }, + { + "epoch": 2.9, + "learning_rate": 5.282884384583917e-07, + "loss": 1.6882, + "step": 61300 + }, + { + "epoch": 2.9, + "learning_rate": 5.240526480202211e-07, + "loss": 1.5872, + "step": 61400 + }, + { + "epoch": 2.91, + "learning_rate": 5.198303099929429e-07, + "loss": 1.6554, + "step": 61500 + }, + { + "epoch": 2.91, + "learning_rate": 5.156214825770591e-07, + "loss": 1.6168, + "step": 61600 + }, + { + "epoch": 2.92, + "learning_rate": 5.114262237868423e-07, + "loss": 1.5752, + "step": 61700 + }, + { + "epoch": 2.92, + "learning_rate": 5.072445914495355e-07, + "loss": 1.655, + "step": 61800 + }, + { + "epoch": 2.93, + "learning_rate": 5.030766432045565e-07, + "loss": 1.6429, + "step": 61900 + }, + { + "epoch": 2.93, + "learning_rate": 4.989224365027009e-07, + "loss": 1.6156, + "step": 62000 + }, + { + "epoch": 2.94, + "learning_rate": 4.947820286053518e-07, + "loss": 1.6634, + "step": 62100 + }, + { + "epoch": 2.94, + "learning_rate": 4.906554765836916e-07, + "loss": 1.7337, + "step": 62200 + }, + { + "epoch": 2.94, + "learning_rate": 4.865428373179121e-07, + "loss": 1.6085, + "step": 62300 + }, + { + "epoch": 2.95, + "learning_rate": 4.824441674964334e-07, + "loss": 1.6445, + "step": 62400 + }, + { + "epoch": 2.95, + "learning_rate": 4.783595236151211e-07, + "loss": 1.7347, + "step": 62500 + }, + { + "epoch": 2.96, + "learning_rate": 4.7428896197650816e-07, + "loss": 1.5851, + "step": 62600 + }, + { + "epoch": 2.96, + "learning_rate": 4.702325386890184e-07, + "loss": 1.6059, + "step": 62700 + }, + { + "epoch": 2.97, + "learning_rate": 4.661903096661929e-07, + "loss": 1.6562, + "step": 62800 + }, + { + "epoch": 2.97, + "learning_rate": 4.6216233062592107e-07, + "loss": 1.6983, + "step": 62900 + }, + { + "epoch": 2.98, + "learning_rate": 4.581486570896701e-07, + "loss": 1.7001, + "step": 63000 + }, + { + "epoch": 2.98, + "learning_rate": 4.541493443817206e-07, + "loss": 1.5994, + "step": 63100 + }, + { + "epoch": 2.99, + "learning_rate": 4.501644476284045e-07, + "loss": 1.6582, + "step": 63200 + }, + { + "epoch": 2.99, + "learning_rate": 4.4619402175734606e-07, + "loss": 1.7147, + "step": 63300 + }, + { + "epoch": 3.0, + "learning_rate": 4.4223812149670195e-07, + "loss": 1.6312, + "step": 63400 + }, + { + "epoch": 3.0, + "eval_loss": 1.287863850593567, + "eval_runtime": 157.981, + "eval_samples_per_second": 5.88, + "eval_steps_per_second": 5.88, + "step": 63480 + }, + { + "epoch": 3.0, + "learning_rate": 4.3829680137440883e-07, + "loss": 1.5559, + "step": 63500 + }, + { + "epoch": 3.01, + "learning_rate": 4.343701157174329e-07, + "loss": 1.6739, + "step": 63600 + }, + { + "epoch": 3.01, + "learning_rate": 4.3045811865101767e-07, + "loss": 1.4717, + "step": 63700 + }, + { + "epoch": 3.02, + "learning_rate": 4.265608640979411e-07, + "loss": 1.6384, + "step": 63800 + }, + { + "epoch": 3.02, + "learning_rate": 4.226784057777699e-07, + "loss": 1.5138, + "step": 63900 + }, + { + "epoch": 3.03, + "learning_rate": 4.1881079720612204e-07, + "loss": 1.5968, + "step": 64000 + }, + { + "epoch": 3.03, + "learning_rate": 4.149580916939255e-07, + "loss": 1.5826, + "step": 64100 + }, + { + "epoch": 3.03, + "learning_rate": 4.1112034234668615e-07, + "loss": 1.7272, + "step": 64200 + }, + { + "epoch": 3.04, + "learning_rate": 4.0729760206375404e-07, + "loss": 1.722, + "step": 64300 + }, + { + "epoch": 3.04, + "learning_rate": 4.0348992353759657e-07, + "loss": 1.6016, + "step": 64400 + }, + { + "epoch": 3.05, + "learning_rate": 3.9969735925306884e-07, + "loss": 1.5948, + "step": 64500 + }, + { + "epoch": 3.05, + "learning_rate": 3.95919961486693e-07, + "loss": 1.4295, + "step": 64600 + }, + { + "epoch": 3.06, + "learning_rate": 3.9215778230593563e-07, + "loss": 1.6671, + "step": 64700 + }, + { + "epoch": 3.06, + "learning_rate": 3.8841087356849295e-07, + "loss": 1.6863, + "step": 64800 + }, + { + "epoch": 3.07, + "learning_rate": 3.846792869215725e-07, + "loss": 1.7321, + "step": 64900 + }, + { + "epoch": 3.07, + "learning_rate": 3.8096307380118334e-07, + "loss": 1.6549, + "step": 65000 + }, + { + "epoch": 3.08, + "learning_rate": 3.7726228543142645e-07, + "loss": 1.672, + "step": 65100 + }, + { + "epoch": 3.08, + "learning_rate": 3.7357697282378916e-07, + "loss": 1.5972, + "step": 65200 + }, + { + "epoch": 3.09, + "learning_rate": 3.6990718677644103e-07, + "loss": 1.6149, + "step": 65300 + }, + { + "epoch": 3.09, + "learning_rate": 3.662529778735354e-07, + "loss": 1.6483, + "step": 65400 + }, + { + "epoch": 3.1, + "learning_rate": 3.6261439648450973e-07, + "loss": 1.5785, + "step": 65500 + }, + { + "epoch": 3.1, + "learning_rate": 3.5899149276339345e-07, + "loss": 1.7107, + "step": 65600 + }, + { + "epoch": 3.11, + "learning_rate": 3.553843166481148e-07, + "loss": 1.6453, + "step": 65700 + }, + { + "epoch": 3.11, + "learning_rate": 3.517929178598151e-07, + "loss": 1.7473, + "step": 65800 + }, + { + "epoch": 3.11, + "learning_rate": 3.4821734590216027e-07, + "loss": 1.6577, + "step": 65900 + }, + { + "epoch": 3.12, + "learning_rate": 3.4465765006066065e-07, + "loss": 1.6899, + "step": 66000 + }, + { + "epoch": 3.12, + "learning_rate": 3.4111387940199014e-07, + "loss": 1.5638, + "step": 66100 + }, + { + "epoch": 3.13, + "learning_rate": 3.3758608277331257e-07, + "loss": 1.7071, + "step": 66200 + }, + { + "epoch": 3.13, + "learning_rate": 3.3407430880160433e-07, + "loss": 1.4997, + "step": 66300 + }, + { + "epoch": 3.14, + "learning_rate": 3.3057860589298746e-07, + "loss": 1.4916, + "step": 66400 + }, + { + "epoch": 3.14, + "learning_rate": 3.2709902223206136e-07, + "loss": 1.7187, + "step": 66500 + }, + { + "epoch": 3.15, + "learning_rate": 3.2363560578123807e-07, + "loss": 1.6423, + "step": 66600 + }, + { + "epoch": 3.15, + "learning_rate": 3.2018840428008176e-07, + "loss": 1.6532, + "step": 66700 + }, + { + "epoch": 3.16, + "learning_rate": 3.167574652446497e-07, + "loss": 1.7428, + "step": 66800 + }, + { + "epoch": 3.16, + "learning_rate": 3.133428359668401e-07, + "loss": 1.6502, + "step": 66900 + }, + { + "epoch": 3.17, + "learning_rate": 3.099445635137365e-07, + "loss": 1.6433, + "step": 67000 + }, + { + "epoch": 3.17, + "learning_rate": 3.0656269472696136e-07, + "loss": 1.8004, + "step": 67100 + }, + { + "epoch": 3.18, + "learning_rate": 3.031972762220291e-07, + "loss": 1.5027, + "step": 67200 + }, + { + "epoch": 3.18, + "learning_rate": 2.998483543877065e-07, + "loss": 1.7288, + "step": 67300 + }, + { + "epoch": 3.19, + "learning_rate": 2.965159753853681e-07, + "loss": 1.7243, + "step": 67400 + }, + { + "epoch": 3.19, + "learning_rate": 2.93200185148364e-07, + "loss": 1.7439, + "step": 67500 + }, + { + "epoch": 3.2, + "learning_rate": 2.8990102938138685e-07, + "loss": 1.6872, + "step": 67600 + }, + { + "epoch": 3.2, + "learning_rate": 2.866185535598389e-07, + "loss": 1.588, + "step": 67700 + }, + { + "epoch": 3.2, + "learning_rate": 2.83352802929207e-07, + "loss": 1.6774, + "step": 67800 + }, + { + "epoch": 3.21, + "learning_rate": 2.801038225044403e-07, + "loss": 1.663, + "step": 67900 + }, + { + "epoch": 3.21, + "learning_rate": 2.7687165706932636e-07, + "loss": 1.6019, + "step": 68000 + }, + { + "epoch": 3.22, + "learning_rate": 2.7365635117587673e-07, + "loss": 1.6143, + "step": 68100 + }, + { + "epoch": 3.22, + "learning_rate": 2.704579491437113e-07, + "loss": 1.7017, + "step": 68200 + }, + { + "epoch": 3.23, + "learning_rate": 2.672764950594491e-07, + "loss": 1.5511, + "step": 68300 + }, + { + "epoch": 3.23, + "learning_rate": 2.641120327760981e-07, + "loss": 1.6588, + "step": 68400 + }, + { + "epoch": 3.24, + "learning_rate": 2.609646059124529e-07, + "loss": 1.6361, + "step": 68500 + }, + { + "epoch": 3.24, + "learning_rate": 2.578342578524922e-07, + "loss": 1.6632, + "step": 68600 + }, + { + "epoch": 3.25, + "learning_rate": 2.54721031744782e-07, + "loss": 1.7003, + "step": 68700 + }, + { + "epoch": 3.25, + "eval_loss": 1.2819619178771973, + "eval_runtime": 159.0772, + "eval_samples_per_second": 5.84, + "eval_steps_per_second": 5.84, + "step": 68770 + }, + { + "epoch": 3.25, + "learning_rate": 2.516249705018797e-07, + "loss": 1.6595, + "step": 68800 + }, + { + "epoch": 3.26, + "learning_rate": 2.485461167997429e-07, + "loss": 1.5922, + "step": 68900 + }, + { + "epoch": 3.26, + "learning_rate": 2.4548451307714115e-07, + "loss": 1.6189, + "step": 69000 + }, + { + "epoch": 3.27, + "learning_rate": 2.4244020153507233e-07, + "loss": 1.697, + "step": 69100 + }, + { + "epoch": 3.27, + "learning_rate": 2.394132241361782e-07, + "loss": 1.5139, + "step": 69200 + }, + { + "epoch": 3.28, + "learning_rate": 2.364036226041679e-07, + "loss": 1.5705, + "step": 69300 + }, + { + "epoch": 3.28, + "learning_rate": 2.334114384232437e-07, + "loss": 1.7863, + "step": 69400 + }, + { + "epoch": 3.28, + "learning_rate": 2.3043671283752649e-07, + "loss": 1.7356, + "step": 69500 + }, + { + "epoch": 3.29, + "learning_rate": 2.274794868504891e-07, + "loss": 1.7676, + "step": 69600 + }, + { + "epoch": 3.29, + "learning_rate": 2.2453980122439088e-07, + "loss": 1.6916, + "step": 69700 + }, + { + "epoch": 3.3, + "learning_rate": 2.2161769647971637e-07, + "loss": 1.6342, + "step": 69800 + }, + { + "epoch": 3.3, + "learning_rate": 2.1871321289461466e-07, + "loss": 1.6444, + "step": 69900 + }, + { + "epoch": 3.31, + "learning_rate": 2.158263905043462e-07, + "loss": 1.5611, + "step": 70000 + }, + { + "epoch": 3.31, + "learning_rate": 2.1295726910073137e-07, + "loss": 1.5657, + "step": 70100 + }, + { + "epoch": 3.32, + "learning_rate": 2.101058882316e-07, + "loss": 1.696, + "step": 70200 + }, + { + "epoch": 3.32, + "learning_rate": 2.072722872002473e-07, + "loss": 1.524, + "step": 70300 + }, + { + "epoch": 3.33, + "learning_rate": 2.0445650506489188e-07, + "loss": 1.5935, + "step": 70400 + }, + { + "epoch": 3.33, + "learning_rate": 2.016585806381388e-07, + "loss": 1.6433, + "step": 70500 + }, + { + "epoch": 3.34, + "learning_rate": 1.988785524864421e-07, + "loss": 1.7118, + "step": 70600 + }, + { + "epoch": 3.34, + "learning_rate": 1.9611645892957414e-07, + "loss": 1.6446, + "step": 70700 + }, + { + "epoch": 3.35, + "learning_rate": 1.9337233804009918e-07, + "loss": 1.6606, + "step": 70800 + }, + { + "epoch": 3.35, + "learning_rate": 1.9064622764284618e-07, + "loss": 1.7027, + "step": 70900 + }, + { + "epoch": 3.36, + "learning_rate": 1.8793816531438797e-07, + "loss": 1.5923, + "step": 71000 + }, + { + "epoch": 3.36, + "learning_rate": 1.8524818838252422e-07, + "loss": 1.6598, + "step": 71100 + }, + { + "epoch": 3.37, + "learning_rate": 1.8257633392576656e-07, + "loss": 1.6341, + "step": 71200 + }, + { + "epoch": 3.37, + "learning_rate": 1.7992263877282706e-07, + "loss": 1.64, + "step": 71300 + }, + { + "epoch": 3.37, + "learning_rate": 1.7728713950211067e-07, + "loss": 1.6781, + "step": 71400 + }, + { + "epoch": 3.38, + "learning_rate": 1.7466987244121086e-07, + "loss": 1.5765, + "step": 71500 + }, + { + "epoch": 3.38, + "learning_rate": 1.7207087366641055e-07, + "loss": 1.617, + "step": 71600 + }, + { + "epoch": 3.39, + "learning_rate": 1.6949017900218217e-07, + "loss": 1.6218, + "step": 71700 + }, + { + "epoch": 3.39, + "learning_rate": 1.669278240206954e-07, + "loss": 1.6219, + "step": 71800 + }, + { + "epoch": 3.4, + "learning_rate": 1.64383844041327e-07, + "loss": 1.6668, + "step": 71900 + }, + { + "epoch": 3.4, + "learning_rate": 1.618582741301735e-07, + "loss": 1.642, + "step": 72000 + }, + { + "epoch": 3.41, + "learning_rate": 1.593511490995681e-07, + "loss": 1.6472, + "step": 72100 + }, + { + "epoch": 3.41, + "learning_rate": 1.568625035076e-07, + "loss": 1.7778, + "step": 72200 + }, + { + "epoch": 3.42, + "learning_rate": 1.5439237165763936e-07, + "loss": 1.6262, + "step": 72300 + }, + { + "epoch": 3.42, + "learning_rate": 1.519407875978636e-07, + "loss": 1.7338, + "step": 72400 + }, + { + "epoch": 3.43, + "learning_rate": 1.4950778512078788e-07, + "loss": 1.6323, + "step": 72500 + }, + { + "epoch": 3.43, + "learning_rate": 1.4709339776280102e-07, + "loss": 1.5763, + "step": 72600 + }, + { + "epoch": 3.44, + "learning_rate": 1.4469765880370056e-07, + "loss": 1.5935, + "step": 72700 + }, + { + "epoch": 3.44, + "learning_rate": 1.423206012662359e-07, + "loss": 1.612, + "step": 72800 + }, + { + "epoch": 3.45, + "learning_rate": 1.399622579156526e-07, + "loss": 1.6644, + "step": 72900 + }, + { + "epoch": 3.45, + "learning_rate": 1.376226612592413e-07, + "loss": 1.5736, + "step": 73000 + }, + { + "epoch": 3.46, + "learning_rate": 1.3530184354588837e-07, + "loss": 1.7126, + "step": 73100 + }, + { + "epoch": 3.46, + "learning_rate": 1.329998367656325e-07, + "loss": 1.6738, + "step": 73200 + }, + { + "epoch": 3.46, + "learning_rate": 1.3071667264922282e-07, + "loss": 1.5867, + "step": 73300 + }, + { + "epoch": 3.47, + "learning_rate": 1.284523826676835e-07, + "loss": 1.7888, + "step": 73400 + }, + { + "epoch": 3.47, + "learning_rate": 1.2620699803187724e-07, + "loss": 1.5483, + "step": 73500 + }, + { + "epoch": 3.48, + "learning_rate": 1.239805496920764e-07, + "loss": 1.6639, + "step": 73600 + }, + { + "epoch": 3.48, + "learning_rate": 1.2177306833753744e-07, + "loss": 1.6227, + "step": 73700 + }, + { + "epoch": 3.49, + "learning_rate": 1.1958458439607562e-07, + "loss": 1.5036, + "step": 73800 + }, + { + "epoch": 3.49, + "learning_rate": 1.1741512803364751e-07, + "loss": 1.6368, + "step": 73900 + }, + { + "epoch": 3.5, + "learning_rate": 1.1526472915393399e-07, + "loss": 1.6915, + "step": 74000 + }, + { + "epoch": 3.5, + "eval_loss": 1.2813829183578491, + "eval_runtime": 158.5635, + "eval_samples_per_second": 5.859, + "eval_steps_per_second": 5.859, + "step": 74060 + }, + { + "epoch": 3.5, + "learning_rate": 1.1313341739792921e-07, + "loss": 1.7454, + "step": 74100 + }, + { + "epoch": 3.51, + "learning_rate": 1.1102122214353044e-07, + "loss": 1.575, + "step": 74200 + }, + { + "epoch": 3.51, + "learning_rate": 1.0892817250513476e-07, + "loss": 1.6332, + "step": 74300 + }, + { + "epoch": 3.52, + "learning_rate": 1.0685429733323632e-07, + "loss": 1.5274, + "step": 74400 + }, + { + "epoch": 3.52, + "learning_rate": 1.0479962521403036e-07, + "loss": 1.6464, + "step": 74500 + }, + { + "epoch": 3.53, + "learning_rate": 1.0276418446901703e-07, + "loss": 1.7318, + "step": 74600 + }, + { + "epoch": 3.53, + "learning_rate": 1.0074800315461302e-07, + "loss": 1.5266, + "step": 74700 + }, + { + "epoch": 3.54, + "learning_rate": 9.875110906176337e-08, + "loss": 1.6372, + "step": 74800 + }, + { + "epoch": 3.54, + "learning_rate": 9.67735297155593e-08, + "loss": 1.557, + "step": 74900 + }, + { + "epoch": 3.54, + "learning_rate": 9.481529237485865e-08, + "loss": 1.6721, + "step": 75000 + }, + { + "epoch": 3.55, + "learning_rate": 9.287642403190948e-08, + "loss": 1.6316, + "step": 75100 + }, + { + "epoch": 3.55, + "learning_rate": 9.095695141197919e-08, + "loss": 1.6914, + "step": 75200 + }, + { + "epoch": 3.56, + "learning_rate": 8.905690097298507e-08, + "loss": 1.6764, + "step": 75300 + }, + { + "epoch": 3.56, + "learning_rate": 8.717629890512967e-08, + "loss": 1.5671, + "step": 75400 + }, + { + "epoch": 3.57, + "learning_rate": 8.531517113054111e-08, + "loss": 1.607, + "step": 75500 + }, + { + "epoch": 3.57, + "learning_rate": 8.347354330291401e-08, + "loss": 1.4986, + "step": 75600 + }, + { + "epoch": 3.58, + "learning_rate": 8.165144080715676e-08, + "loss": 1.4958, + "step": 75700 + }, + { + "epoch": 3.58, + "learning_rate": 7.984888875904162e-08, + "loss": 1.6586, + "step": 75800 + }, + { + "epoch": 3.59, + "learning_rate": 7.80659120048588e-08, + "loss": 1.6024, + "step": 75900 + }, + { + "epoch": 3.59, + "learning_rate": 7.6302535121073e-08, + "loss": 1.7341, + "step": 76000 + }, + { + "epoch": 3.6, + "learning_rate": 7.45587824139855e-08, + "loss": 1.7988, + "step": 76100 + }, + { + "epoch": 3.6, + "learning_rate": 7.283467791939902e-08, + "loss": 1.7466, + "step": 76200 + }, + { + "epoch": 3.61, + "learning_rate": 7.113024540228619e-08, + "loss": 1.6675, + "step": 76300 + }, + { + "epoch": 3.61, + "learning_rate": 6.94455083564619e-08, + "loss": 1.5927, + "step": 76400 + }, + { + "epoch": 3.62, + "learning_rate": 6.778049000426007e-08, + "loss": 1.5928, + "step": 76500 + }, + { + "epoch": 3.62, + "learning_rate": 6.613521329621214e-08, + "loss": 1.6168, + "step": 76600 + }, + { + "epoch": 3.63, + "learning_rate": 6.45097009107331e-08, + "loss": 1.5762, + "step": 76700 + }, + { + "epoch": 3.63, + "learning_rate": 6.29039752538062e-08, + "loss": 1.5915, + "step": 76800 + }, + { + "epoch": 3.63, + "learning_rate": 6.1318058458676e-08, + "loss": 1.7576, + "step": 76900 + }, + { + "epoch": 3.64, + "learning_rate": 5.975197238554265e-08, + "loss": 1.4722, + "step": 77000 + }, + { + "epoch": 3.64, + "learning_rate": 5.8205738621260776e-08, + "loss": 1.6109, + "step": 77100 + }, + { + "epoch": 3.65, + "learning_rate": 5.6679378479040756e-08, + "loss": 1.6115, + "step": 77200 + }, + { + "epoch": 3.65, + "learning_rate": 5.517291299815724e-08, + "loss": 1.6724, + "step": 77300 + }, + { + "epoch": 3.66, + "learning_rate": 5.3686362943657084e-08, + "loss": 1.6488, + "step": 77400 + }, + { + "epoch": 3.66, + "learning_rate": 5.221974880607389e-08, + "loss": 1.7043, + "step": 77500 + }, + { + "epoch": 3.67, + "learning_rate": 5.077309080114573e-08, + "loss": 1.7738, + "step": 77600 + }, + { + "epoch": 3.67, + "learning_rate": 4.934640886953656e-08, + "loss": 1.5892, + "step": 77700 + }, + { + "epoch": 3.68, + "learning_rate": 4.7939722676560435e-08, + "loss": 1.7314, + "step": 77800 + }, + { + "epoch": 3.68, + "learning_rate": 4.655305161191187e-08, + "loss": 1.6078, + "step": 77900 + }, + { + "epoch": 3.69, + "learning_rate": 4.518641478939706e-08, + "loss": 1.6222, + "step": 78000 + }, + { + "epoch": 3.69, + "learning_rate": 4.3839831046671777e-08, + "loss": 1.5819, + "step": 78100 + }, + { + "epoch": 3.7, + "learning_rate": 4.251331894498106e-08, + "loss": 1.5665, + "step": 78200 + }, + { + "epoch": 3.7, + "learning_rate": 4.1206896768903104e-08, + "loss": 1.7198, + "step": 78300 + }, + { + "epoch": 3.71, + "learning_rate": 3.992058252609776e-08, + "loss": 1.6506, + "step": 78400 + }, + { + "epoch": 3.71, + "learning_rate": 3.865439394705811e-08, + "loss": 1.6282, + "step": 78500 + }, + { + "epoch": 3.72, + "learning_rate": 3.740834848486613e-08, + "loss": 1.5705, + "step": 78600 + }, + { + "epoch": 3.72, + "learning_rate": 3.6182463314951564e-08, + "loss": 1.6093, + "step": 78700 + }, + { + "epoch": 3.72, + "learning_rate": 3.497675533485645e-08, + "loss": 1.7513, + "step": 78800 + }, + { + "epoch": 3.73, + "learning_rate": 3.379124116400079e-08, + "loss": 1.7255, + "step": 78900 + }, + { + "epoch": 3.73, + "learning_rate": 3.262593714345441e-08, + "loss": 1.6083, + "step": 79000 + }, + { + "epoch": 3.74, + "learning_rate": 3.148085933571115e-08, + "loss": 1.6724, + "step": 79100 + }, + { + "epoch": 3.74, + "learning_rate": 3.0356023524468204e-08, + "loss": 1.688, + "step": 79200 + }, + { + "epoch": 3.75, + "learning_rate": 2.9251445214407592e-08, + "loss": 1.5757, + "step": 79300 + }, + { + "epoch": 3.75, + "eval_loss": 1.2812554836273193, + "eval_runtime": 158.4775, + "eval_samples_per_second": 5.862, + "eval_steps_per_second": 5.862, + "step": 79350 + }, + { + "epoch": 3.75, + "learning_rate": 2.816713963098305e-08, + "loss": 1.6287, + "step": 79400 + }, + { + "epoch": 3.76, + "learning_rate": 2.7103121720210178e-08, + "loss": 1.674, + "step": 79500 + }, + { + "epoch": 3.76, + "learning_rate": 2.6059406148460086e-08, + "loss": 1.6933, + "step": 79600 + }, + { + "epoch": 3.77, + "learning_rate": 2.5036007302257413e-08, + "loss": 1.7177, + "step": 79700 + }, + { + "epoch": 3.77, + "learning_rate": 2.4032939288082312e-08, + "loss": 1.5981, + "step": 79800 + }, + { + "epoch": 3.78, + "learning_rate": 2.30502159321756e-08, + "loss": 1.6676, + "step": 79900 + }, + { + "epoch": 3.78, + "learning_rate": 2.2087850780348086e-08, + "loss": 1.6508, + "step": 80000 + } + ], + "logging_steps": 100, + "max_steps": 84628, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 5000, + "total_flos": 4.738333907799245e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}