{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004484304932735426, "grad_norm": 4.696539476451585, "learning_rate": 1.3452914798206278e-08, "loss": 0.9912, "step": 1 }, { "epoch": 0.004484304932735426, "grad_norm": 5.089904667658368, "learning_rate": 1.345291479820628e-07, "loss": 1.0341, "step": 10 }, { "epoch": 0.008968609865470852, "grad_norm": 5.546828630388097, "learning_rate": 2.690582959641256e-07, "loss": 1.0502, "step": 20 }, { "epoch": 0.013452914798206279, "grad_norm": 4.113849381101499, "learning_rate": 4.0358744394618834e-07, "loss": 1.0386, "step": 30 }, { "epoch": 0.017937219730941704, "grad_norm": 3.6548963622814887, "learning_rate": 5.381165919282512e-07, "loss": 1.0282, "step": 40 }, { "epoch": 0.02242152466367713, "grad_norm": 2.157564670206396, "learning_rate": 6.72645739910314e-07, "loss": 0.9574, "step": 50 }, { "epoch": 0.026905829596412557, "grad_norm": 2.0184475272019555, "learning_rate": 8.071748878923767e-07, "loss": 0.9263, "step": 60 }, { "epoch": 0.03139013452914798, "grad_norm": 1.7894937443172652, "learning_rate": 9.417040358744395e-07, "loss": 0.9253, "step": 70 }, { "epoch": 0.03587443946188341, "grad_norm": 1.6533764414432808, "learning_rate": 1.0762331838565023e-06, "loss": 0.9106, "step": 80 }, { "epoch": 0.04035874439461883, "grad_norm": 1.9561381307359194, "learning_rate": 1.2107623318385651e-06, "loss": 0.8713, "step": 90 }, { "epoch": 0.04484304932735426, "grad_norm": 1.5478472557018526, "learning_rate": 1.345291479820628e-06, "loss": 0.8741, "step": 100 }, { "epoch": 0.04484304932735426, "eval_loss": 0.8599640727043152, "eval_runtime": 430.7233, "eval_samples_per_second": 116.263, "eval_steps_per_second": 1.818, "step": 100 }, { "epoch": 0.04932735426008968, "grad_norm": 1.5759592930264636, "learning_rate": 1.4798206278026905e-06, "loss": 0.8381, "step": 110 }, { "epoch": 0.053811659192825115, "grad_norm": 1.5446577353242628, "learning_rate": 1.6143497757847533e-06, "loss": 0.8151, "step": 120 }, { "epoch": 0.05829596412556054, "grad_norm": 1.6899841974229757, "learning_rate": 1.7488789237668162e-06, "loss": 0.8309, "step": 130 }, { "epoch": 0.06278026905829596, "grad_norm": 1.6274283098945213, "learning_rate": 1.883408071748879e-06, "loss": 0.8509, "step": 140 }, { "epoch": 0.06726457399103139, "grad_norm": 1.7690619100525546, "learning_rate": 2.0179372197309418e-06, "loss": 0.8057, "step": 150 }, { "epoch": 0.07174887892376682, "grad_norm": 1.866473004768342, "learning_rate": 2.1524663677130046e-06, "loss": 0.8236, "step": 160 }, { "epoch": 0.07623318385650224, "grad_norm": 1.5528009019380091, "learning_rate": 2.2869955156950674e-06, "loss": 0.7936, "step": 170 }, { "epoch": 0.08071748878923767, "grad_norm": 1.8924349879943885, "learning_rate": 2.4215246636771302e-06, "loss": 0.8054, "step": 180 }, { "epoch": 0.08520179372197309, "grad_norm": 1.5998254884542162, "learning_rate": 2.556053811659193e-06, "loss": 0.7971, "step": 190 }, { "epoch": 0.08968609865470852, "grad_norm": 1.553085624058612, "learning_rate": 2.690582959641256e-06, "loss": 0.8038, "step": 200 }, { "epoch": 0.08968609865470852, "eval_loss": 0.8094644546508789, "eval_runtime": 412.1717, "eval_samples_per_second": 121.495, "eval_steps_per_second": 1.9, "step": 200 }, { "epoch": 0.09417040358744394, "grad_norm": 1.6621378080442881, "learning_rate": 2.8251121076233187e-06, "loss": 0.7815, "step": 210 }, { "epoch": 0.09865470852017937, "grad_norm": 1.5875832641605891, "learning_rate": 2.959641255605381e-06, "loss": 0.8088, "step": 220 }, { "epoch": 0.1031390134529148, "grad_norm": 1.6006597094640902, "learning_rate": 2.99990995533251e-06, "loss": 0.8141, "step": 230 }, { "epoch": 0.10762331838565023, "grad_norm": 1.7932554350094232, "learning_rate": 2.9994689462512194e-06, "loss": 0.7834, "step": 240 }, { "epoch": 0.11210762331838565, "grad_norm": 1.6444723214299724, "learning_rate": 2.998660541859271e-06, "loss": 0.7797, "step": 250 }, { "epoch": 0.11659192825112108, "grad_norm": 1.790145213655978, "learning_rate": 2.9974849402294452e-06, "loss": 0.8046, "step": 260 }, { "epoch": 0.1210762331838565, "grad_norm": 1.8694283184605, "learning_rate": 2.9959424294040703e-06, "loss": 0.7802, "step": 270 }, { "epoch": 0.12556053811659193, "grad_norm": 1.6030839509233756, "learning_rate": 2.9940333873244464e-06, "loss": 0.8032, "step": 280 }, { "epoch": 0.13004484304932734, "grad_norm": 1.664910362160235, "learning_rate": 2.991758281738245e-06, "loss": 0.7802, "step": 290 }, { "epoch": 0.13452914798206278, "grad_norm": 1.6726792291262853, "learning_rate": 2.989117670084902e-06, "loss": 0.7937, "step": 300 }, { "epoch": 0.13452914798206278, "eval_loss": 0.7789004445075989, "eval_runtime": 410.6605, "eval_samples_per_second": 121.943, "eval_steps_per_second": 1.907, "step": 300 }, { "epoch": 0.13901345291479822, "grad_norm": 1.4685211047526556, "learning_rate": 2.986112199359036e-06, "loss": 0.7486, "step": 310 }, { "epoch": 0.14349775784753363, "grad_norm": 2.0076694355781575, "learning_rate": 2.9827426059519237e-06, "loss": 0.808, "step": 320 }, { "epoch": 0.14798206278026907, "grad_norm": 1.557780179088859, "learning_rate": 2.9790097154710697e-06, "loss": 0.7849, "step": 330 }, { "epoch": 0.15246636771300448, "grad_norm": 1.3610248283116362, "learning_rate": 2.9749144425379216e-06, "loss": 0.7696, "step": 340 }, { "epoch": 0.15695067264573992, "grad_norm": 1.5050628258310632, "learning_rate": 2.9704577905637718e-06, "loss": 0.7497, "step": 350 }, { "epoch": 0.16143497757847533, "grad_norm": 1.4313536098763806, "learning_rate": 2.9656408515039017e-06, "loss": 0.7544, "step": 360 }, { "epoch": 0.16591928251121077, "grad_norm": 1.6003065628553548, "learning_rate": 2.9604648055900368e-06, "loss": 0.7648, "step": 370 }, { "epoch": 0.17040358744394618, "grad_norm": 1.633334409956319, "learning_rate": 2.9549309210411697e-06, "loss": 0.7471, "step": 380 }, { "epoch": 0.17488789237668162, "grad_norm": 1.5700271693529286, "learning_rate": 2.949040553752826e-06, "loss": 0.8009, "step": 390 }, { "epoch": 0.17937219730941703, "grad_norm": 1.4854276734758955, "learning_rate": 2.9427951469648425e-06, "loss": 0.7712, "step": 400 }, { "epoch": 0.17937219730941703, "eval_loss": 0.7643527388572693, "eval_runtime": 413.4678, "eval_samples_per_second": 121.115, "eval_steps_per_second": 1.894, "step": 400 }, { "epoch": 0.18385650224215247, "grad_norm": 1.4160940764229815, "learning_rate": 2.936196230907755e-06, "loss": 0.7532, "step": 410 }, { "epoch": 0.18834080717488788, "grad_norm": 1.4265290618310995, "learning_rate": 2.929245422427861e-06, "loss": 0.7703, "step": 420 }, { "epoch": 0.19282511210762332, "grad_norm": 1.6899882763333507, "learning_rate": 2.9219444245910674e-06, "loss": 0.7919, "step": 430 }, { "epoch": 0.19730941704035873, "grad_norm": 1.4186337044303068, "learning_rate": 2.9142950262656098e-06, "loss": 0.7477, "step": 440 }, { "epoch": 0.20179372197309417, "grad_norm": 1.4178331376670448, "learning_rate": 2.9062991016837496e-06, "loss": 0.7734, "step": 450 }, { "epoch": 0.2062780269058296, "grad_norm": 1.4503162574851487, "learning_rate": 2.897958609982556e-06, "loss": 0.7447, "step": 460 }, { "epoch": 0.21076233183856502, "grad_norm": 1.558520612711291, "learning_rate": 2.8892755947238818e-06, "loss": 0.741, "step": 470 }, { "epoch": 0.21524663677130046, "grad_norm": 1.4382572158325275, "learning_rate": 2.8802521833936595e-06, "loss": 0.7563, "step": 480 }, { "epoch": 0.21973094170403587, "grad_norm": 1.5964216489171685, "learning_rate": 2.870890586880629e-06, "loss": 0.7554, "step": 490 }, { "epoch": 0.2242152466367713, "grad_norm": 1.496069010720812, "learning_rate": 2.8611930989346322e-06, "loss": 0.7393, "step": 500 }, { "epoch": 0.2242152466367713, "eval_loss": 0.7564548254013062, "eval_runtime": 408.8965, "eval_samples_per_second": 122.469, "eval_steps_per_second": 1.915, "step": 500 }, { "epoch": 0.22869955156950672, "grad_norm": 1.4866290735466012, "learning_rate": 2.851162095604607e-06, "loss": 0.7499, "step": 510 }, { "epoch": 0.23318385650224216, "grad_norm": 1.3341919240907245, "learning_rate": 2.8408000346564136e-06, "loss": 0.7524, "step": 520 }, { "epoch": 0.23766816143497757, "grad_norm": 1.6374942242171213, "learning_rate": 2.8301094549706405e-06, "loss": 0.7386, "step": 530 }, { "epoch": 0.242152466367713, "grad_norm": 1.6225803035616944, "learning_rate": 2.8190929759205366e-06, "loss": 0.7616, "step": 540 }, { "epoch": 0.24663677130044842, "grad_norm": 1.4683777464043755, "learning_rate": 2.807753296730219e-06, "loss": 0.7564, "step": 550 }, { "epoch": 0.25112107623318386, "grad_norm": 1.350460716883926, "learning_rate": 2.7960931958133183e-06, "loss": 0.7424, "step": 560 }, { "epoch": 0.2556053811659193, "grad_norm": 1.522474854464212, "learning_rate": 2.7841155300922202e-06, "loss": 0.7331, "step": 570 }, { "epoch": 0.2600896860986547, "grad_norm": 1.448720887976205, "learning_rate": 2.7718232342980693e-06, "loss": 0.7657, "step": 580 }, { "epoch": 0.2645739910313901, "grad_norm": 1.6744619426337854, "learning_rate": 2.759219320251714e-06, "loss": 0.7363, "step": 590 }, { "epoch": 0.26905829596412556, "grad_norm": 1.3585539591402243, "learning_rate": 2.7463068761257554e-06, "loss": 0.7458, "step": 600 }, { "epoch": 0.26905829596412556, "eval_loss": 0.7505608797073364, "eval_runtime": 408.9234, "eval_samples_per_second": 122.461, "eval_steps_per_second": 1.915, "step": 600 }, { "epoch": 0.273542600896861, "grad_norm": 1.580932873164111, "learning_rate": 2.7330890656878943e-06, "loss": 0.7565, "step": 610 }, { "epoch": 0.27802690582959644, "grad_norm": 1.5329888412189265, "learning_rate": 2.7195691275257547e-06, "loss": 0.7457, "step": 620 }, { "epoch": 0.2825112107623318, "grad_norm": 1.6754413400622026, "learning_rate": 2.7057503742533753e-06, "loss": 0.7392, "step": 630 }, { "epoch": 0.28699551569506726, "grad_norm": 1.6247897070260917, "learning_rate": 2.691636191699562e-06, "loss": 0.758, "step": 640 }, { "epoch": 0.2914798206278027, "grad_norm": 1.42356323236888, "learning_rate": 2.6772300380783013e-06, "loss": 0.7626, "step": 650 }, { "epoch": 0.29596412556053814, "grad_norm": 1.4955853270730488, "learning_rate": 2.662535443141443e-06, "loss": 0.7355, "step": 660 }, { "epoch": 0.3004484304932735, "grad_norm": 1.4879073313151545, "learning_rate": 2.647556007313847e-06, "loss": 0.7545, "step": 670 }, { "epoch": 0.30493273542600896, "grad_norm": 1.4153755477305148, "learning_rate": 2.6322954008112213e-06, "loss": 0.7378, "step": 680 }, { "epoch": 0.3094170403587444, "grad_norm": 1.4019993036978922, "learning_rate": 2.616757362740855e-06, "loss": 0.7387, "step": 690 }, { "epoch": 0.31390134529147984, "grad_norm": 1.5335241758091316, "learning_rate": 2.600945700185474e-06, "loss": 0.7694, "step": 700 }, { "epoch": 0.31390134529147984, "eval_loss": 0.7457958459854126, "eval_runtime": 408.7761, "eval_samples_per_second": 122.505, "eval_steps_per_second": 1.915, "step": 700 }, { "epoch": 0.3183856502242152, "grad_norm": 1.47263429505246, "learning_rate": 2.5848642872704417e-06, "loss": 0.7246, "step": 710 }, { "epoch": 0.32286995515695066, "grad_norm": 1.5062835613914285, "learning_rate": 2.5685170642145337e-06, "loss": 0.7338, "step": 720 }, { "epoch": 0.3273542600896861, "grad_norm": 1.6182138547104117, "learning_rate": 2.5519080363645134e-06, "loss": 0.73, "step": 730 }, { "epoch": 0.33183856502242154, "grad_norm": 1.3515300425343295, "learning_rate": 2.53504127321376e-06, "loss": 0.7299, "step": 740 }, { "epoch": 0.336322869955157, "grad_norm": 1.5798782493243635, "learning_rate": 2.517920907405168e-06, "loss": 0.7293, "step": 750 }, { "epoch": 0.34080717488789236, "grad_norm": 1.4549259580353344, "learning_rate": 2.5005511337185824e-06, "loss": 0.7621, "step": 760 }, { "epoch": 0.3452914798206278, "grad_norm": 1.456599605633329, "learning_rate": 2.4829362080430077e-06, "loss": 0.7438, "step": 770 }, { "epoch": 0.34977578475336324, "grad_norm": 1.4128813340833153, "learning_rate": 2.4650804463338406e-06, "loss": 0.7413, "step": 780 }, { "epoch": 0.3542600896860987, "grad_norm": 1.5613737124434628, "learning_rate": 2.4469882235553887e-06, "loss": 0.7477, "step": 790 }, { "epoch": 0.35874439461883406, "grad_norm": 1.6383373422678345, "learning_rate": 2.4286639726089293e-06, "loss": 0.713, "step": 800 }, { "epoch": 0.35874439461883406, "eval_loss": 0.7421520352363586, "eval_runtime": 408.0589, "eval_samples_per_second": 122.72, "eval_steps_per_second": 1.919, "step": 800 }, { "epoch": 0.3632286995515695, "grad_norm": 1.3492102003393152, "learning_rate": 2.4101121832465754e-06, "loss": 0.7185, "step": 810 }, { "epoch": 0.36771300448430494, "grad_norm": 1.4117655797526263, "learning_rate": 2.3913374009712084e-06, "loss": 0.7379, "step": 820 }, { "epoch": 0.3721973094170404, "grad_norm": 1.5281693242796246, "learning_rate": 2.3723442259227547e-06, "loss": 0.7406, "step": 830 }, { "epoch": 0.37668161434977576, "grad_norm": 1.6990323130848894, "learning_rate": 2.3531373117510695e-06, "loss": 0.7388, "step": 840 }, { "epoch": 0.3811659192825112, "grad_norm": 1.476162200960684, "learning_rate": 2.33372136447572e-06, "loss": 0.7434, "step": 850 }, { "epoch": 0.38565022421524664, "grad_norm": 1.3930484173784414, "learning_rate": 2.3141011413329244e-06, "loss": 0.7372, "step": 860 }, { "epoch": 0.3901345291479821, "grad_norm": 1.4071716332679987, "learning_rate": 2.2942814496099532e-06, "loss": 0.7531, "step": 870 }, { "epoch": 0.39461883408071746, "grad_norm": 1.5479232446038012, "learning_rate": 2.274267145467259e-06, "loss": 0.7216, "step": 880 }, { "epoch": 0.3991031390134529, "grad_norm": 1.4255077423798548, "learning_rate": 2.254063132748637e-06, "loss": 0.7343, "step": 890 }, { "epoch": 0.40358744394618834, "grad_norm": 1.57276996130409, "learning_rate": 2.2336743617797006e-06, "loss": 0.7347, "step": 900 }, { "epoch": 0.40358744394618834, "eval_loss": 0.7386789321899414, "eval_runtime": 408.1839, "eval_samples_per_second": 122.682, "eval_steps_per_second": 1.918, "step": 900 }, { "epoch": 0.4080717488789238, "grad_norm": 1.4568107529063017, "learning_rate": 2.213105828154964e-06, "loss": 0.7266, "step": 910 }, { "epoch": 0.4125560538116592, "grad_norm": 1.374198091231606, "learning_rate": 2.192362571513841e-06, "loss": 0.7465, "step": 920 }, { "epoch": 0.4170403587443946, "grad_norm": 1.3925457206301284, "learning_rate": 2.171449674305846e-06, "loss": 0.7427, "step": 930 }, { "epoch": 0.42152466367713004, "grad_norm": 1.4443502855856463, "learning_rate": 2.1503722605453083e-06, "loss": 0.7428, "step": 940 }, { "epoch": 0.4260089686098655, "grad_norm": 1.5268146365443709, "learning_rate": 2.1291354945559004e-06, "loss": 0.7163, "step": 950 }, { "epoch": 0.4304932735426009, "grad_norm": 1.5000325455240473, "learning_rate": 2.1077445797052945e-06, "loss": 0.7472, "step": 960 }, { "epoch": 0.4349775784753363, "grad_norm": 1.4869091852092478, "learning_rate": 2.086204757130243e-06, "loss": 0.7427, "step": 970 }, { "epoch": 0.43946188340807174, "grad_norm": 1.4430282256544564, "learning_rate": 2.0645213044524194e-06, "loss": 0.7174, "step": 980 }, { "epoch": 0.4439461883408072, "grad_norm": 1.4822025498870304, "learning_rate": 2.0426995344853043e-06, "loss": 0.7538, "step": 990 }, { "epoch": 0.4484304932735426, "grad_norm": 1.5186234240452396, "learning_rate": 2.0207447939324598e-06, "loss": 0.7243, "step": 1000 }, { "epoch": 0.4484304932735426, "eval_loss": 0.7356163859367371, "eval_runtime": 407.0139, "eval_samples_per_second": 123.035, "eval_steps_per_second": 1.924, "step": 1000 }, { "epoch": 0.452914798206278, "grad_norm": 1.5742685454152958, "learning_rate": 1.998662462077496e-06, "loss": 0.7475, "step": 1010 }, { "epoch": 0.45739910313901344, "grad_norm": 1.3834168469611057, "learning_rate": 1.976457949466054e-06, "loss": 0.7568, "step": 1020 }, { "epoch": 0.4618834080717489, "grad_norm": 1.4947961999330186, "learning_rate": 1.954136696580132e-06, "loss": 0.7464, "step": 1030 }, { "epoch": 0.4663677130044843, "grad_norm": 1.4284253764088304, "learning_rate": 1.9317041725050747e-06, "loss": 0.7456, "step": 1040 }, { "epoch": 0.47085201793721976, "grad_norm": 1.4247354157320633, "learning_rate": 1.909165873589554e-06, "loss": 0.7008, "step": 1050 }, { "epoch": 0.47533632286995514, "grad_norm": 1.4525308368306575, "learning_rate": 1.886527322098871e-06, "loss": 0.7121, "step": 1060 }, { "epoch": 0.4798206278026906, "grad_norm": 1.43738036112722, "learning_rate": 1.8637940648619065e-06, "loss": 0.7308, "step": 1070 }, { "epoch": 0.484304932735426, "grad_norm": 1.402086349899742, "learning_rate": 1.8409716719120561e-06, "loss": 0.7164, "step": 1080 }, { "epoch": 0.48878923766816146, "grad_norm": 1.5227358428935063, "learning_rate": 1.8180657351224739e-06, "loss": 0.732, "step": 1090 }, { "epoch": 0.49327354260089684, "grad_norm": 1.5813743714389112, "learning_rate": 1.7950818668359733e-06, "loss": 0.7161, "step": 1100 }, { "epoch": 0.49327354260089684, "eval_loss": 0.7330535054206848, "eval_runtime": 408.4081, "eval_samples_per_second": 122.615, "eval_steps_per_second": 1.917, "step": 1100 }, { "epoch": 0.4977578475336323, "grad_norm": 1.4881819590713468, "learning_rate": 1.772025698489903e-06, "loss": 0.7144, "step": 1110 }, { "epoch": 0.5022421524663677, "grad_norm": 1.4750319990458514, "learning_rate": 1.7489028792363549e-06, "loss": 0.7365, "step": 1120 }, { "epoch": 0.5067264573991032, "grad_norm": 1.4443590686278198, "learning_rate": 1.7257190745580209e-06, "loss": 0.7487, "step": 1130 }, { "epoch": 0.5112107623318386, "grad_norm": 1.4695293763109774, "learning_rate": 1.7024799648800555e-06, "loss": 0.7233, "step": 1140 }, { "epoch": 0.515695067264574, "grad_norm": 1.4328944860273993, "learning_rate": 1.679191244178278e-06, "loss": 0.7322, "step": 1150 }, { "epoch": 0.5201793721973094, "grad_norm": 1.4157130638413895, "learning_rate": 1.6558586185840473e-06, "loss": 0.728, "step": 1160 }, { "epoch": 0.5246636771300448, "grad_norm": 1.4117533616122613, "learning_rate": 1.6324878049861656e-06, "loss": 0.7331, "step": 1170 }, { "epoch": 0.5291479820627802, "grad_norm": 1.4255877674393056, "learning_rate": 1.609084529630145e-06, "loss": 0.7491, "step": 1180 }, { "epoch": 0.5336322869955157, "grad_norm": 1.4486300200418207, "learning_rate": 1.5856545267151759e-06, "loss": 0.7261, "step": 1190 }, { "epoch": 0.5381165919282511, "grad_norm": 1.4628618883782867, "learning_rate": 1.5622035369891561e-06, "loss": 0.7247, "step": 1200 }, { "epoch": 0.5381165919282511, "eval_loss": 0.7308038473129272, "eval_runtime": 406.6873, "eval_samples_per_second": 123.134, "eval_steps_per_second": 1.925, "step": 1200 }, { "epoch": 0.5426008968609866, "grad_norm": 1.4112256357672157, "learning_rate": 1.5387373063421062e-06, "loss": 0.7307, "step": 1210 }, { "epoch": 0.547085201793722, "grad_norm": 1.3994109954542429, "learning_rate": 1.515261584398333e-06, "loss": 0.7062, "step": 1220 }, { "epoch": 0.5515695067264574, "grad_norm": 1.5279436893984248, "learning_rate": 1.491782123107669e-06, "loss": 0.7314, "step": 1230 }, { "epoch": 0.5560538116591929, "grad_norm": 1.4092281762272858, "learning_rate": 1.4683046753361521e-06, "loss": 0.7044, "step": 1240 }, { "epoch": 0.5605381165919282, "grad_norm": 1.4363381867810665, "learning_rate": 1.4448349934564736e-06, "loss": 0.7287, "step": 1250 }, { "epoch": 0.5650224215246636, "grad_norm": 1.4913351223697051, "learning_rate": 1.421378827938549e-06, "loss": 0.7254, "step": 1260 }, { "epoch": 0.5695067264573991, "grad_norm": 1.5096384680619075, "learning_rate": 1.3979419259405563e-06, "loss": 0.7389, "step": 1270 }, { "epoch": 0.5739910313901345, "grad_norm": 1.3495144573299676, "learning_rate": 1.3745300299007856e-06, "loss": 0.7247, "step": 1280 }, { "epoch": 0.57847533632287, "grad_norm": 1.3641879848291365, "learning_rate": 1.3511488761306412e-06, "loss": 0.7312, "step": 1290 }, { "epoch": 0.5829596412556054, "grad_norm": 1.3879105033157129, "learning_rate": 1.3278041934091524e-06, "loss": 0.7477, "step": 1300 }, { "epoch": 0.5829596412556054, "eval_loss": 0.7287724018096924, "eval_runtime": 406.882, "eval_samples_per_second": 123.075, "eval_steps_per_second": 1.924, "step": 1300 }, { "epoch": 0.5874439461883408, "grad_norm": 1.3916697284582622, "learning_rate": 1.3045017015793217e-06, "loss": 0.7246, "step": 1310 }, { "epoch": 0.5919282511210763, "grad_norm": 1.4328511876779917, "learning_rate": 1.2812471101466687e-06, "loss": 0.7303, "step": 1320 }, { "epoch": 0.5964125560538116, "grad_norm": 1.4411092846252307, "learning_rate": 1.2580461168803038e-06, "loss": 0.7318, "step": 1330 }, { "epoch": 0.600896860986547, "grad_norm": 1.4703965551927338, "learning_rate": 1.2349044064168782e-06, "loss": 0.7375, "step": 1340 }, { "epoch": 0.6053811659192825, "grad_norm": 1.4319057117061509, "learning_rate": 1.21182764886775e-06, "loss": 0.7302, "step": 1350 }, { "epoch": 0.6098654708520179, "grad_norm": 1.5017976848926429, "learning_rate": 1.188821498429714e-06, "loss": 0.7262, "step": 1360 }, { "epoch": 0.6143497757847534, "grad_norm": 1.4553869576056546, "learning_rate": 1.165891591999626e-06, "loss": 0.7447, "step": 1370 }, { "epoch": 0.6188340807174888, "grad_norm": 1.4128744043127173, "learning_rate": 1.1430435477932646e-06, "loss": 0.7423, "step": 1380 }, { "epoch": 0.6233183856502242, "grad_norm": 1.3797159286061107, "learning_rate": 1.1202829639687785e-06, "loss": 0.744, "step": 1390 }, { "epoch": 0.6278026905829597, "grad_norm": 1.487304571595245, "learning_rate": 1.0976154172550408e-06, "loss": 0.7429, "step": 1400 }, { "epoch": 0.6278026905829597, "eval_loss": 0.7272571921348572, "eval_runtime": 406.7541, "eval_samples_per_second": 123.114, "eval_steps_per_second": 1.925, "step": 1400 }, { "epoch": 0.6322869955156951, "grad_norm": 1.544512062570189, "learning_rate": 1.0750464615852523e-06, "loss": 0.7251, "step": 1410 }, { "epoch": 0.6367713004484304, "grad_norm": 1.422563130817404, "learning_rate": 1.0525816267361398e-06, "loss": 0.712, "step": 1420 }, { "epoch": 0.6412556053811659, "grad_norm": 1.4937681764382644, "learning_rate": 1.0302264169730613e-06, "loss": 0.7203, "step": 1430 }, { "epoch": 0.6457399103139013, "grad_norm": 1.50738757049434, "learning_rate": 1.0079863097013722e-06, "loss": 0.7121, "step": 1440 }, { "epoch": 0.6502242152466368, "grad_norm": 1.286396172710849, "learning_rate": 9.85866754124367e-07, "loss": 0.7193, "step": 1450 }, { "epoch": 0.6547085201793722, "grad_norm": 1.4997539342741677, "learning_rate": 9.638731699081281e-07, "loss": 0.7288, "step": 1460 }, { "epoch": 0.6591928251121076, "grad_norm": 1.37434247409356, "learning_rate": 9.42010945853623e-07, "loss": 0.7597, "step": 1470 }, { "epoch": 0.6636771300448431, "grad_norm": 1.3869436283100607, "learning_rate": 9.202854385763502e-07, "loss": 0.7184, "step": 1480 }, { "epoch": 0.6681614349775785, "grad_norm": 1.3970067087387381, "learning_rate": 8.987019711938812e-07, "loss": 0.7326, "step": 1490 }, { "epoch": 0.672645739910314, "grad_norm": 1.553183464191494, "learning_rate": 8.772658320216047e-07, "loss": 0.7317, "step": 1500 }, { "epoch": 0.672645739910314, "eval_loss": 0.7256098389625549, "eval_runtime": 406.6132, "eval_samples_per_second": 123.156, "eval_steps_per_second": 1.926, "step": 1500 }, { "epoch": 0.6771300448430493, "grad_norm": 1.3357768297094936, "learning_rate": 8.55982273277002e-07, "loss": 0.7347, "step": 1510 }, { "epoch": 0.6816143497757847, "grad_norm": 1.3249788097985131, "learning_rate": 8.348565097927605e-07, "loss": 0.7496, "step": 1520 }, { "epoch": 0.6860986547085202, "grad_norm": 1.4578138220875878, "learning_rate": 8.13893717739056e-07, "loss": 0.7308, "step": 1530 }, { "epoch": 0.6905829596412556, "grad_norm": 1.3268077719441809, "learning_rate": 7.930990333553013e-07, "loss": 0.7094, "step": 1540 }, { "epoch": 0.695067264573991, "grad_norm": 1.47562182506043, "learning_rate": 7.72477551691678e-07, "loss": 0.697, "step": 1550 }, { "epoch": 0.6995515695067265, "grad_norm": 1.4850843190566259, "learning_rate": 7.520343253607677e-07, "loss": 0.7301, "step": 1560 }, { "epoch": 0.7040358744394619, "grad_norm": 1.5097763618083517, "learning_rate": 7.317743632995731e-07, "loss": 0.7217, "step": 1570 }, { "epoch": 0.7085201793721974, "grad_norm": 1.3914348509226637, "learning_rate": 7.117026295422425e-07, "loss": 0.6957, "step": 1580 }, { "epoch": 0.7130044843049327, "grad_norm": 1.5175208261545492, "learning_rate": 6.918240420038007e-07, "loss": 0.7317, "step": 1590 }, { "epoch": 0.7174887892376681, "grad_norm": 1.4947559578839034, "learning_rate": 6.721434712751745e-07, "loss": 0.7226, "step": 1600 }, { "epoch": 0.7174887892376681, "eval_loss": 0.7243176102638245, "eval_runtime": 406.7899, "eval_samples_per_second": 123.103, "eval_steps_per_second": 1.925, "step": 1600 }, { "epoch": 0.7219730941704036, "grad_norm": 1.5192098207309965, "learning_rate": 6.526657394298154e-07, "loss": 0.705, "step": 1610 }, { "epoch": 0.726457399103139, "grad_norm": 1.3665027387136646, "learning_rate": 6.333956188422088e-07, "loss": 0.706, "step": 1620 }, { "epoch": 0.7309417040358744, "grad_norm": 1.4974912840899435, "learning_rate": 6.143378310185643e-07, "loss": 0.6983, "step": 1630 }, { "epoch": 0.7354260089686099, "grad_norm": 1.5477574584643699, "learning_rate": 5.954970454399638e-07, "loss": 0.7252, "step": 1640 }, { "epoch": 0.7399103139013453, "grad_norm": 1.525090065151942, "learning_rate": 5.768778784182616e-07, "loss": 0.7087, "step": 1650 }, { "epoch": 0.7443946188340808, "grad_norm": 1.4837554579437873, "learning_rate": 5.584848919650069e-07, "loss": 0.7075, "step": 1660 }, { "epoch": 0.7488789237668162, "grad_norm": 1.3538329119260115, "learning_rate": 5.403225926736772e-07, "loss": 0.7057, "step": 1670 }, { "epoch": 0.7533632286995515, "grad_norm": 1.359895087573495, "learning_rate": 5.223954306154843e-07, "loss": 0.7306, "step": 1680 }, { "epoch": 0.757847533632287, "grad_norm": 1.4168148218595764, "learning_rate": 5.047077982490311e-07, "loss": 0.7424, "step": 1690 }, { "epoch": 0.7623318385650224, "grad_norm": 1.4815842671642683, "learning_rate": 4.872640293440861e-07, "loss": 0.695, "step": 1700 }, { "epoch": 0.7623318385650224, "eval_loss": 0.7233718633651733, "eval_runtime": 406.8015, "eval_samples_per_second": 123.099, "eval_steps_per_second": 1.925, "step": 1700 }, { "epoch": 0.7668161434977578, "grad_norm": 1.5501655544071418, "learning_rate": 4.7006839791973673e-07, "loss": 0.7327, "step": 1710 }, { "epoch": 0.7713004484304933, "grad_norm": 1.3834984705411, "learning_rate": 4.53125117197179e-07, "loss": 0.7245, "step": 1720 }, { "epoch": 0.7757847533632287, "grad_norm": 1.4041748328697374, "learning_rate": 4.364383385674112e-07, "loss": 0.7054, "step": 1730 }, { "epoch": 0.7802690582959642, "grad_norm": 1.443104622604103, "learning_rate": 4.2001215057407026e-07, "loss": 0.7037, "step": 1740 }, { "epoch": 0.7847533632286996, "grad_norm": 1.5632699202433824, "learning_rate": 4.038505779116687e-07, "loss": 0.705, "step": 1750 }, { "epoch": 0.7892376681614349, "grad_norm": 1.349615732583278, "learning_rate": 3.879575804394782e-07, "loss": 0.7071, "step": 1760 }, { "epoch": 0.7937219730941704, "grad_norm": 1.3657530768128234, "learning_rate": 3.7233705221129646e-07, "loss": 0.7273, "step": 1770 }, { "epoch": 0.7982062780269058, "grad_norm": 1.5107387856649341, "learning_rate": 3.569928205213354e-07, "loss": 0.6975, "step": 1780 }, { "epoch": 0.8026905829596412, "grad_norm": 1.4525568524987686, "learning_rate": 3.419286449664741e-07, "loss": 0.7095, "step": 1790 }, { "epoch": 0.8071748878923767, "grad_norm": 1.4847854049722584, "learning_rate": 3.2714821652508854e-07, "loss": 0.7167, "step": 1800 }, { "epoch": 0.8071748878923767, "eval_loss": 0.7225807309150696, "eval_runtime": 406.5326, "eval_samples_per_second": 123.181, "eval_steps_per_second": 1.926, "step": 1800 }, { "epoch": 0.8116591928251121, "grad_norm": 1.2447161837361285, "learning_rate": 3.126551566527036e-07, "loss": 0.7156, "step": 1810 }, { "epoch": 0.8161434977578476, "grad_norm": 1.4139333132454484, "learning_rate": 2.9845301639467284e-07, "loss": 0.7537, "step": 1820 }, { "epoch": 0.820627802690583, "grad_norm": 1.3663031642715642, "learning_rate": 2.8454527551611205e-07, "loss": 0.7238, "step": 1830 }, { "epoch": 0.8251121076233184, "grad_norm": 1.389263976301968, "learning_rate": 2.7093534164929904e-07, "loss": 0.738, "step": 1840 }, { "epoch": 0.8295964125560538, "grad_norm": 1.5068808968575202, "learning_rate": 2.576265494587458e-07, "loss": 0.7067, "step": 1850 }, { "epoch": 0.8340807174887892, "grad_norm": 1.4226178531466935, "learning_rate": 2.446221598241472e-07, "loss": 0.7143, "step": 1860 }, { "epoch": 0.8385650224215246, "grad_norm": 1.6881847148932905, "learning_rate": 2.319253590414132e-07, "loss": 0.7376, "step": 1870 }, { "epoch": 0.8430493273542601, "grad_norm": 1.4353283330892004, "learning_rate": 2.1953925804197056e-07, "loss": 0.7095, "step": 1880 }, { "epoch": 0.8475336322869955, "grad_norm": 1.4639605071750654, "learning_rate": 2.0746689163053113e-07, "loss": 0.7102, "step": 1890 }, { "epoch": 0.852017937219731, "grad_norm": 1.458703799588621, "learning_rate": 1.9571121774151545e-07, "loss": 0.686, "step": 1900 }, { "epoch": 0.852017937219731, "eval_loss": 0.7220604419708252, "eval_runtime": 406.5609, "eval_samples_per_second": 123.172, "eval_steps_per_second": 1.926, "step": 1900 }, { "epoch": 0.8565022421524664, "grad_norm": 1.470148783910905, "learning_rate": 1.8427511671430757e-07, "loss": 0.72, "step": 1910 }, { "epoch": 0.8609865470852018, "grad_norm": 1.3891242748262451, "learning_rate": 1.7316139058752194e-07, "loss": 0.7318, "step": 1920 }, { "epoch": 0.8654708520179372, "grad_norm": 1.2245069775705093, "learning_rate": 1.6237276241245867e-07, "loss": 0.7155, "step": 1930 }, { "epoch": 0.8699551569506726, "grad_norm": 1.360510189488915, "learning_rate": 1.519118755859084e-07, "loss": 0.7255, "step": 1940 }, { "epoch": 0.874439461883408, "grad_norm": 1.495119615923585, "learning_rate": 1.4178129320247486e-07, "loss": 0.7484, "step": 1950 }, { "epoch": 0.8789237668161435, "grad_norm": 1.3674856635367474, "learning_rate": 1.31983497426575e-07, "loss": 0.7366, "step": 1960 }, { "epoch": 0.8834080717488789, "grad_norm": 1.4494730150421093, "learning_rate": 1.2252088888426431e-07, "loss": 0.742, "step": 1970 }, { "epoch": 0.8878923766816144, "grad_norm": 1.4368197978682802, "learning_rate": 1.1339578607504536e-07, "loss": 0.7269, "step": 1980 }, { "epoch": 0.8923766816143498, "grad_norm": 1.4017197990051706, "learning_rate": 1.0461042480379402e-07, "loss": 0.7234, "step": 1990 }, { "epoch": 0.8968609865470852, "grad_norm": 1.426560347266084, "learning_rate": 9.616695763295007e-08, "loss": 0.7214, "step": 2000 }, { "epoch": 0.8968609865470852, "eval_loss": 0.721759557723999, "eval_runtime": 406.5838, "eval_samples_per_second": 123.165, "eval_steps_per_second": 1.926, "step": 2000 }, { "epoch": 0.9013452914798207, "grad_norm": 1.489947255967281, "learning_rate": 8.806745335510297e-08, "loss": 0.7341, "step": 2010 }, { "epoch": 0.905829596412556, "grad_norm": 1.4312716003053576, "learning_rate": 8.031389648610266e-08, "loss": 0.7264, "step": 2020 }, { "epoch": 0.9103139013452914, "grad_norm": 1.4764400641380824, "learning_rate": 7.290818677881966e-08, "loss": 0.7301, "step": 2030 }, { "epoch": 0.9147982062780269, "grad_norm": 1.4381108917682341, "learning_rate": 6.585213875767305e-08, "loss": 0.6997, "step": 2040 }, { "epoch": 0.9192825112107623, "grad_norm": 1.459723127188453, "learning_rate": 5.914748127404102e-08, "loss": 0.7168, "step": 2050 }, { "epoch": 0.9237668161434978, "grad_norm": 1.5776619173541433, "learning_rate": 5.2795857082663655e-08, "loss": 0.72, "step": 2060 }, { "epoch": 0.9282511210762332, "grad_norm": 1.438610611700907, "learning_rate": 4.6798822439140185e-08, "loss": 0.7035, "step": 2070 }, { "epoch": 0.9327354260089686, "grad_norm": 1.4350411032390504, "learning_rate": 4.115784671861916e-08, "loss": 0.735, "step": 2080 }, { "epoch": 0.9372197309417041, "grad_norm": 1.4822578142933729, "learning_rate": 3.587431205577713e-08, "loss": 0.7178, "step": 2090 }, { "epoch": 0.9417040358744395, "grad_norm": 1.5001233187138816, "learning_rate": 3.0949513006172325e-08, "loss": 0.7358, "step": 2100 }, { "epoch": 0.9417040358744395, "eval_loss": 0.7216091752052307, "eval_runtime": 406.6258, "eval_samples_per_second": 123.153, "eval_steps_per_second": 1.926, "step": 2100 }, { "epoch": 0.9461883408071748, "grad_norm": 1.4457564058059627, "learning_rate": 2.6384656229056946e-08, "loss": 0.7285, "step": 2110 }, { "epoch": 0.9506726457399103, "grad_norm": 1.6789172768348999, "learning_rate": 2.218086019172394e-08, "loss": 0.7027, "step": 2120 }, { "epoch": 0.9551569506726457, "grad_norm": 1.4039832008414181, "learning_rate": 1.8339154895464894e-08, "loss": 0.7285, "step": 2130 }, { "epoch": 0.9596412556053812, "grad_norm": 1.7674026844330886, "learning_rate": 1.4860481623201417e-08, "loss": 0.713, "step": 2140 }, { "epoch": 0.9641255605381166, "grad_norm": 1.531580121339593, "learning_rate": 1.1745692708855282e-08, "loss": 0.7328, "step": 2150 }, { "epoch": 0.968609865470852, "grad_norm": 1.455884868550825, "learning_rate": 8.99555132851232e-09, "loss": 0.7196, "step": 2160 }, { "epoch": 0.9730941704035875, "grad_norm": 1.3157536936429735, "learning_rate": 6.610731313430318e-09, "loss": 0.7277, "step": 2170 }, { "epoch": 0.9775784753363229, "grad_norm": 1.5586404477319191, "learning_rate": 4.5918169849406e-09, "loss": 0.7265, "step": 2180 }, { "epoch": 0.9820627802690582, "grad_norm": 1.3596393082767964, "learning_rate": 2.939303011277872e-09, "loss": 0.719, "step": 2190 }, { "epoch": 0.9865470852017937, "grad_norm": 1.3866642718972106, "learning_rate": 1.6535942863788456e-09, "loss": 0.7259, "step": 2200 }, { "epoch": 0.9865470852017937, "eval_loss": 0.7215752005577087, "eval_runtime": 408.9437, "eval_samples_per_second": 122.455, "eval_steps_per_second": 1.915, "step": 2200 }, { "epoch": 0.9910313901345291, "grad_norm": 1.6643780128489514, "learning_rate": 7.350058306764273e-10, "loss": 0.7044, "step": 2210 }, { "epoch": 0.9955156950672646, "grad_norm": 1.428221428067804, "learning_rate": 1.8376271391412624e-10, "loss": 0.7109, "step": 2220 }, { "epoch": 1.0, "grad_norm": 1.3882910125414851, "learning_rate": 0.0, "loss": 0.7123, "step": 2230 }, { "epoch": 1.0, "step": 2230, "total_flos": 250303561007104.0, "train_loss": 0.7492096503219262, "train_runtime": 18007.2993, "train_samples_per_second": 15.851, "train_steps_per_second": 0.124 } ], "logging_steps": 10, "max_steps": 2230, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 250303561007104.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }