{ "best_metric": 0.9622641509433962, "best_model_checkpoint": "wav2vec2-2Class-easy-train-test-large/checkpoint-2520", "epoch": 782.2222222222222, "eval_steps": 500, "global_step": 8800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.98, "eval_accuracy": 0.4088050314465409, "eval_loss": 0.7003181576728821, "eval_runtime": 1.8048, "eval_samples_per_second": 88.1, "eval_steps_per_second": 5.541, "step": 11 }, { "epoch": 1.96, "eval_accuracy": 0.4088050314465409, "eval_loss": 0.7001124620437622, "eval_runtime": 1.7728, "eval_samples_per_second": 89.69, "eval_steps_per_second": 5.641, "step": 22 }, { "epoch": 2.93, "eval_accuracy": 0.41509433962264153, "eval_loss": 0.69970703125, "eval_runtime": 1.7593, "eval_samples_per_second": 90.375, "eval_steps_per_second": 5.684, "step": 33 }, { "epoch": 4.0, "eval_accuracy": 0.42138364779874216, "eval_loss": 0.6991450786590576, "eval_runtime": 1.7582, "eval_samples_per_second": 90.433, "eval_steps_per_second": 5.688, "step": 45 }, { "epoch": 4.44, "grad_norm": 0.8353477716445923, "learning_rate": 1.7045454545454546e-06, "loss": 0.6976, "step": 50 }, { "epoch": 4.98, "eval_accuracy": 0.4276729559748428, "eval_loss": 0.6984724998474121, "eval_runtime": 1.7849, "eval_samples_per_second": 89.08, "eval_steps_per_second": 5.603, "step": 56 }, { "epoch": 5.96, "eval_accuracy": 0.44025157232704404, "eval_loss": 0.697744607925415, "eval_runtime": 2.127, "eval_samples_per_second": 74.753, "eval_steps_per_second": 4.701, "step": 67 }, { "epoch": 6.93, "eval_accuracy": 0.44654088050314467, "eval_loss": 0.6968724727630615, "eval_runtime": 2.2513, "eval_samples_per_second": 70.624, "eval_steps_per_second": 4.442, "step": 78 }, { "epoch": 8.0, "eval_accuracy": 0.46540880503144655, "eval_loss": 0.6957085728645325, "eval_runtime": 2.1194, "eval_samples_per_second": 75.021, "eval_steps_per_second": 4.718, "step": 90 }, { "epoch": 8.89, "grad_norm": 0.45805710554122925, "learning_rate": 3.409090909090909e-06, "loss": 0.6952, "step": 100 }, { "epoch": 8.98, "eval_accuracy": 0.46540880503144655, "eval_loss": 0.6945385932922363, "eval_runtime": 2.2918, "eval_samples_per_second": 69.378, "eval_steps_per_second": 4.363, "step": 101 }, { "epoch": 9.96, "eval_accuracy": 0.4779874213836478, "eval_loss": 0.6933900117874146, "eval_runtime": 2.2504, "eval_samples_per_second": 70.654, "eval_steps_per_second": 4.444, "step": 112 }, { "epoch": 10.93, "eval_accuracy": 0.49056603773584906, "eval_loss": 0.692146360874176, "eval_runtime": 2.1543, "eval_samples_per_second": 73.804, "eval_steps_per_second": 4.642, "step": 123 }, { "epoch": 12.0, "eval_accuracy": 0.5471698113207547, "eval_loss": 0.6906170845031738, "eval_runtime": 2.0832, "eval_samples_per_second": 76.326, "eval_steps_per_second": 4.8, "step": 135 }, { "epoch": 12.98, "eval_accuracy": 0.610062893081761, "eval_loss": 0.6892228722572327, "eval_runtime": 2.0269, "eval_samples_per_second": 78.443, "eval_steps_per_second": 4.934, "step": 146 }, { "epoch": 13.33, "grad_norm": 0.6493268609046936, "learning_rate": 5.1136363636363635e-06, "loss": 0.6911, "step": 150 }, { "epoch": 13.96, "eval_accuracy": 0.6037735849056604, "eval_loss": 0.6878040432929993, "eval_runtime": 2.1502, "eval_samples_per_second": 73.946, "eval_steps_per_second": 4.651, "step": 157 }, { "epoch": 14.93, "eval_accuracy": 0.5911949685534591, "eval_loss": 0.6863483190536499, "eval_runtime": 2.0844, "eval_samples_per_second": 76.279, "eval_steps_per_second": 4.797, "step": 168 }, { "epoch": 16.0, "eval_accuracy": 0.5911949685534591, "eval_loss": 0.6847361326217651, "eval_runtime": 2.1372, "eval_samples_per_second": 74.395, "eval_steps_per_second": 4.679, "step": 180 }, { "epoch": 16.98, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6830993294715881, "eval_runtime": 2.3473, "eval_samples_per_second": 67.739, "eval_steps_per_second": 4.26, "step": 191 }, { "epoch": 17.78, "grad_norm": 0.5862739086151123, "learning_rate": 6.818181818181818e-06, "loss": 0.6852, "step": 200 }, { "epoch": 17.96, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6815393567085266, "eval_runtime": 2.1307, "eval_samples_per_second": 74.623, "eval_steps_per_second": 4.693, "step": 202 }, { "epoch": 18.93, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.679994523525238, "eval_runtime": 2.082, "eval_samples_per_second": 76.37, "eval_steps_per_second": 4.803, "step": 213 }, { "epoch": 20.0, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6782289147377014, "eval_runtime": 2.1302, "eval_samples_per_second": 74.641, "eval_steps_per_second": 4.694, "step": 225 }, { "epoch": 20.98, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6765275001525879, "eval_runtime": 2.0229, "eval_samples_per_second": 78.601, "eval_steps_per_second": 4.943, "step": 236 }, { "epoch": 21.96, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6749551892280579, "eval_runtime": 2.0505, "eval_samples_per_second": 77.542, "eval_steps_per_second": 4.877, "step": 247 }, { "epoch": 22.22, "grad_norm": 0.10243403911590576, "learning_rate": 8.522727272727273e-06, "loss": 0.6783, "step": 250 }, { "epoch": 22.93, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6732170581817627, "eval_runtime": 2.0616, "eval_samples_per_second": 77.125, "eval_steps_per_second": 4.851, "step": 258 }, { "epoch": 24.0, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6713252067565918, "eval_runtime": 2.1605, "eval_samples_per_second": 73.595, "eval_steps_per_second": 4.629, "step": 270 }, { "epoch": 24.98, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6694673895835876, "eval_runtime": 2.0526, "eval_samples_per_second": 77.462, "eval_steps_per_second": 4.872, "step": 281 }, { "epoch": 25.96, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6674391031265259, "eval_runtime": 2.1284, "eval_samples_per_second": 74.704, "eval_steps_per_second": 4.698, "step": 292 }, { "epoch": 26.67, "grad_norm": 0.3114006221294403, "learning_rate": 1.0227272727272727e-05, "loss": 0.6676, "step": 300 }, { "epoch": 26.93, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6654335856437683, "eval_runtime": 1.9991, "eval_samples_per_second": 79.535, "eval_steps_per_second": 5.002, "step": 303 }, { "epoch": 28.0, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6630644202232361, "eval_runtime": 2.0451, "eval_samples_per_second": 77.745, "eval_steps_per_second": 4.89, "step": 315 }, { "epoch": 28.98, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6605831980705261, "eval_runtime": 2.0625, "eval_samples_per_second": 77.092, "eval_steps_per_second": 4.849, "step": 326 }, { "epoch": 29.96, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6578991413116455, "eval_runtime": 2.0381, "eval_samples_per_second": 78.014, "eval_steps_per_second": 4.907, "step": 337 }, { "epoch": 30.93, "eval_accuracy": 0.5849056603773585, "eval_loss": 0.6539114713668823, "eval_runtime": 1.9774, "eval_samples_per_second": 80.407, "eval_steps_per_second": 5.057, "step": 348 }, { "epoch": 31.11, "grad_norm": 0.2134709656238556, "learning_rate": 1.1931818181818181e-05, "loss": 0.6516, "step": 350 }, { "epoch": 32.0, "eval_accuracy": 0.5974842767295597, "eval_loss": 0.6492742896080017, "eval_runtime": 2.0601, "eval_samples_per_second": 77.182, "eval_steps_per_second": 4.854, "step": 360 }, { "epoch": 32.98, "eval_accuracy": 0.610062893081761, "eval_loss": 0.6441397070884705, "eval_runtime": 2.0739, "eval_samples_per_second": 76.667, "eval_steps_per_second": 4.822, "step": 371 }, { "epoch": 33.96, "eval_accuracy": 0.6226415094339622, "eval_loss": 0.6348815560340881, "eval_runtime": 2.1526, "eval_samples_per_second": 73.865, "eval_steps_per_second": 4.646, "step": 382 }, { "epoch": 34.93, "eval_accuracy": 0.6289308176100629, "eval_loss": 0.6257140040397644, "eval_runtime": 2.0081, "eval_samples_per_second": 79.179, "eval_steps_per_second": 4.98, "step": 393 }, { "epoch": 35.56, "grad_norm": 0.8974349498748779, "learning_rate": 1.3636363636363637e-05, "loss": 0.6124, "step": 400 }, { "epoch": 36.0, "eval_accuracy": 0.6415094339622641, "eval_loss": 0.611738920211792, "eval_runtime": 1.9854, "eval_samples_per_second": 80.083, "eval_steps_per_second": 5.037, "step": 405 }, { "epoch": 36.98, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.5910706520080566, "eval_runtime": 2.0618, "eval_samples_per_second": 77.117, "eval_steps_per_second": 4.85, "step": 416 }, { "epoch": 37.96, "eval_accuracy": 0.6918238993710691, "eval_loss": 0.5672016143798828, "eval_runtime": 2.0402, "eval_samples_per_second": 77.932, "eval_steps_per_second": 4.901, "step": 427 }, { "epoch": 38.93, "eval_accuracy": 0.7232704402515723, "eval_loss": 0.5392354130744934, "eval_runtime": 2.2936, "eval_samples_per_second": 69.324, "eval_steps_per_second": 4.36, "step": 438 }, { "epoch": 40.0, "grad_norm": 0.7736309170722961, "learning_rate": 1.534090909090909e-05, "loss": 0.5073, "step": 450 }, { "epoch": 40.0, "eval_accuracy": 0.7547169811320755, "eval_loss": 0.5041937232017517, "eval_runtime": 2.1247, "eval_samples_per_second": 74.835, "eval_steps_per_second": 4.707, "step": 450 }, { "epoch": 40.98, "eval_accuracy": 0.7672955974842768, "eval_loss": 0.47902750968933105, "eval_runtime": 2.163, "eval_samples_per_second": 73.509, "eval_steps_per_second": 4.623, "step": 461 }, { "epoch": 41.96, "eval_accuracy": 0.779874213836478, "eval_loss": 0.47594940662384033, "eval_runtime": 2.1321, "eval_samples_per_second": 74.574, "eval_steps_per_second": 4.69, "step": 472 }, { "epoch": 42.93, "eval_accuracy": 0.7987421383647799, "eval_loss": 0.4369964003562927, "eval_runtime": 2.1555, "eval_samples_per_second": 73.765, "eval_steps_per_second": 4.639, "step": 483 }, { "epoch": 44.0, "eval_accuracy": 0.7987421383647799, "eval_loss": 0.43516698479652405, "eval_runtime": 2.032, "eval_samples_per_second": 78.249, "eval_steps_per_second": 4.921, "step": 495 }, { "epoch": 44.44, "grad_norm": 0.4976819157600403, "learning_rate": 1.7045454545454546e-05, "loss": 0.3489, "step": 500 }, { "epoch": 44.98, "eval_accuracy": 0.7987421383647799, "eval_loss": 0.4422326385974884, "eval_runtime": 2.1135, "eval_samples_per_second": 75.231, "eval_steps_per_second": 4.732, "step": 506 }, { "epoch": 45.96, "eval_accuracy": 0.8050314465408805, "eval_loss": 0.41540881991386414, "eval_runtime": 2.0847, "eval_samples_per_second": 76.27, "eval_steps_per_second": 4.797, "step": 517 }, { "epoch": 46.93, "eval_accuracy": 0.8050314465408805, "eval_loss": 0.4131433367729187, "eval_runtime": 1.9752, "eval_samples_per_second": 80.498, "eval_steps_per_second": 5.063, "step": 528 }, { "epoch": 48.0, "eval_accuracy": 0.8113207547169812, "eval_loss": 0.3975575864315033, "eval_runtime": 2.01, "eval_samples_per_second": 79.104, "eval_steps_per_second": 4.975, "step": 540 }, { "epoch": 48.89, "grad_norm": 0.5197520852088928, "learning_rate": 1.8750000000000002e-05, "loss": 0.2962, "step": 550 }, { "epoch": 48.98, "eval_accuracy": 0.8113207547169812, "eval_loss": 0.39397454261779785, "eval_runtime": 2.0261, "eval_samples_per_second": 78.474, "eval_steps_per_second": 4.935, "step": 551 }, { "epoch": 49.96, "eval_accuracy": 0.8238993710691824, "eval_loss": 0.371494859457016, "eval_runtime": 2.0246, "eval_samples_per_second": 78.535, "eval_steps_per_second": 4.939, "step": 562 }, { "epoch": 50.93, "eval_accuracy": 0.8427672955974843, "eval_loss": 0.34951409697532654, "eval_runtime": 2.3286, "eval_samples_per_second": 68.281, "eval_steps_per_second": 4.294, "step": 573 }, { "epoch": 52.0, "eval_accuracy": 0.8364779874213837, "eval_loss": 0.3481156826019287, "eval_runtime": 1.9542, "eval_samples_per_second": 81.362, "eval_steps_per_second": 5.117, "step": 585 }, { "epoch": 52.98, "eval_accuracy": 0.8176100628930818, "eval_loss": 0.3817409873008728, "eval_runtime": 2.0789, "eval_samples_per_second": 76.484, "eval_steps_per_second": 4.81, "step": 596 }, { "epoch": 53.33, "grad_norm": 0.5608111023902893, "learning_rate": 2.0454545454545454e-05, "loss": 0.2573, "step": 600 }, { "epoch": 53.96, "eval_accuracy": 0.8490566037735849, "eval_loss": 0.3412492871284485, "eval_runtime": 2.0746, "eval_samples_per_second": 76.642, "eval_steps_per_second": 4.82, "step": 607 }, { "epoch": 54.93, "eval_accuracy": 0.8490566037735849, "eval_loss": 0.32929155230522156, "eval_runtime": 1.9991, "eval_samples_per_second": 79.538, "eval_steps_per_second": 5.002, "step": 618 }, { "epoch": 56.0, "eval_accuracy": 0.8427672955974843, "eval_loss": 0.3547687232494354, "eval_runtime": 2.1242, "eval_samples_per_second": 74.851, "eval_steps_per_second": 4.708, "step": 630 }, { "epoch": 56.98, "eval_accuracy": 0.8427672955974843, "eval_loss": 0.3044220209121704, "eval_runtime": 2.0508, "eval_samples_per_second": 77.532, "eval_steps_per_second": 4.876, "step": 641 }, { "epoch": 57.78, "grad_norm": 0.894092321395874, "learning_rate": 2.215909090909091e-05, "loss": 0.2279, "step": 650 }, { "epoch": 57.96, "eval_accuracy": 0.8490566037735849, "eval_loss": 0.32347577810287476, "eval_runtime": 2.2095, "eval_samples_per_second": 71.963, "eval_steps_per_second": 4.526, "step": 652 }, { "epoch": 58.93, "eval_accuracy": 0.8490566037735849, "eval_loss": 0.3371436297893524, "eval_runtime": 2.1055, "eval_samples_per_second": 75.518, "eval_steps_per_second": 4.75, "step": 663 }, { "epoch": 60.0, "eval_accuracy": 0.8490566037735849, "eval_loss": 0.31275492906570435, "eval_runtime": 2.1311, "eval_samples_per_second": 74.61, "eval_steps_per_second": 4.692, "step": 675 }, { "epoch": 60.98, "eval_accuracy": 0.8553459119496856, "eval_loss": 0.32111966609954834, "eval_runtime": 2.0639, "eval_samples_per_second": 77.038, "eval_steps_per_second": 4.845, "step": 686 }, { "epoch": 61.96, "eval_accuracy": 0.8616352201257862, "eval_loss": 0.302960604429245, "eval_runtime": 2.0241, "eval_samples_per_second": 78.552, "eval_steps_per_second": 4.94, "step": 697 }, { "epoch": 62.22, "grad_norm": 0.4315973222255707, "learning_rate": 2.3863636363636362e-05, "loss": 0.2167, "step": 700 }, { "epoch": 62.93, "eval_accuracy": 0.8616352201257862, "eval_loss": 0.29696550965309143, "eval_runtime": 2.034, "eval_samples_per_second": 78.169, "eval_steps_per_second": 4.916, "step": 708 }, { "epoch": 64.0, "eval_accuracy": 0.8679245283018868, "eval_loss": 0.29949402809143066, "eval_runtime": 2.095, "eval_samples_per_second": 75.897, "eval_steps_per_second": 4.773, "step": 720 }, { "epoch": 64.98, "eval_accuracy": 0.8742138364779874, "eval_loss": 0.2867083251476288, "eval_runtime": 2.0417, "eval_samples_per_second": 77.876, "eval_steps_per_second": 4.898, "step": 731 }, { "epoch": 65.96, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.26363295316696167, "eval_runtime": 2.1382, "eval_samples_per_second": 74.363, "eval_steps_per_second": 4.677, "step": 742 }, { "epoch": 66.67, "grad_norm": 0.37665870785713196, "learning_rate": 2.556818181818182e-05, "loss": 0.207, "step": 750 }, { "epoch": 66.93, "eval_accuracy": 0.8805031446540881, "eval_loss": 0.28482353687286377, "eval_runtime": 2.1166, "eval_samples_per_second": 75.119, "eval_steps_per_second": 4.724, "step": 753 }, { "epoch": 68.0, "eval_accuracy": 0.8867924528301887, "eval_loss": 0.2750767767429352, "eval_runtime": 2.1981, "eval_samples_per_second": 72.336, "eval_steps_per_second": 4.549, "step": 765 }, { "epoch": 68.98, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.256393700838089, "eval_runtime": 2.033, "eval_samples_per_second": 78.211, "eval_steps_per_second": 4.919, "step": 776 }, { "epoch": 69.96, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.25443732738494873, "eval_runtime": 2.0096, "eval_samples_per_second": 79.121, "eval_steps_per_second": 4.976, "step": 787 }, { "epoch": 70.93, "eval_accuracy": 0.8742138364779874, "eval_loss": 0.2954423129558563, "eval_runtime": 2.1018, "eval_samples_per_second": 75.649, "eval_steps_per_second": 4.758, "step": 798 }, { "epoch": 71.11, "grad_norm": 0.7302255630493164, "learning_rate": 2.7272727272727273e-05, "loss": 0.1899, "step": 800 }, { "epoch": 72.0, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.25169771909713745, "eval_runtime": 2.041, "eval_samples_per_second": 77.904, "eval_steps_per_second": 4.9, "step": 810 }, { "epoch": 72.98, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.2506076693534851, "eval_runtime": 2.0257, "eval_samples_per_second": 78.49, "eval_steps_per_second": 4.936, "step": 821 }, { "epoch": 73.96, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.2434261441230774, "eval_runtime": 2.0325, "eval_samples_per_second": 78.23, "eval_steps_per_second": 4.92, "step": 832 }, { "epoch": 74.93, "eval_accuracy": 0.89937106918239, "eval_loss": 0.23832084238529205, "eval_runtime": 2.1871, "eval_samples_per_second": 72.699, "eval_steps_per_second": 4.572, "step": 843 }, { "epoch": 75.56, "grad_norm": 0.5180615186691284, "learning_rate": 2.897727272727273e-05, "loss": 0.1801, "step": 850 }, { "epoch": 76.0, "eval_accuracy": 0.89937106918239, "eval_loss": 0.23464229702949524, "eval_runtime": 2.026, "eval_samples_per_second": 78.48, "eval_steps_per_second": 4.936, "step": 855 }, { "epoch": 76.98, "eval_accuracy": 0.89937106918239, "eval_loss": 0.22975026071071625, "eval_runtime": 2.0881, "eval_samples_per_second": 76.147, "eval_steps_per_second": 4.789, "step": 866 }, { "epoch": 77.96, "eval_accuracy": 0.9056603773584906, "eval_loss": 0.2403678596019745, "eval_runtime": 2.075, "eval_samples_per_second": 76.626, "eval_steps_per_second": 4.819, "step": 877 }, { "epoch": 78.93, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.2674010097980499, "eval_runtime": 2.037, "eval_samples_per_second": 78.057, "eval_steps_per_second": 4.909, "step": 888 }, { "epoch": 80.0, "grad_norm": 1.2135472297668457, "learning_rate": 2.9924242424242427e-05, "loss": 0.1692, "step": 900 }, { "epoch": 80.0, "eval_accuracy": 0.89937106918239, "eval_loss": 0.2231501042842865, "eval_runtime": 2.0398, "eval_samples_per_second": 77.949, "eval_steps_per_second": 4.902, "step": 900 }, { "epoch": 80.98, "eval_accuracy": 0.89937106918239, "eval_loss": 0.2390480935573578, "eval_runtime": 1.9822, "eval_samples_per_second": 80.213, "eval_steps_per_second": 5.045, "step": 911 }, { "epoch": 81.96, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.20583955943584442, "eval_runtime": 2.0665, "eval_samples_per_second": 76.94, "eval_steps_per_second": 4.839, "step": 922 }, { "epoch": 82.93, "eval_accuracy": 0.9056603773584906, "eval_loss": 0.2114023119211197, "eval_runtime": 2.0736, "eval_samples_per_second": 76.678, "eval_steps_per_second": 4.823, "step": 933 }, { "epoch": 84.0, "eval_accuracy": 0.89937106918239, "eval_loss": 0.24830691516399384, "eval_runtime": 2.0148, "eval_samples_per_second": 78.915, "eval_steps_per_second": 4.963, "step": 945 }, { "epoch": 84.44, "grad_norm": 0.5111488103866577, "learning_rate": 2.9734848484848486e-05, "loss": 0.1691, "step": 950 }, { "epoch": 84.98, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.2259017676115036, "eval_runtime": 2.2201, "eval_samples_per_second": 71.618, "eval_steps_per_second": 4.504, "step": 956 }, { "epoch": 85.96, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.20239894092082977, "eval_runtime": 2.0671, "eval_samples_per_second": 76.918, "eval_steps_per_second": 4.838, "step": 967 }, { "epoch": 86.93, "eval_accuracy": 0.89937106918239, "eval_loss": 0.20193150639533997, "eval_runtime": 2.0416, "eval_samples_per_second": 77.879, "eval_steps_per_second": 4.898, "step": 978 }, { "epoch": 88.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.19625458121299744, "eval_runtime": 2.0196, "eval_samples_per_second": 78.73, "eval_steps_per_second": 4.952, "step": 990 }, { "epoch": 88.89, "grad_norm": 0.4683234989643097, "learning_rate": 2.9545454545454545e-05, "loss": 0.1609, "step": 1000 }, { "epoch": 88.98, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.21583892405033112, "eval_runtime": 2.0254, "eval_samples_per_second": 78.503, "eval_steps_per_second": 4.937, "step": 1001 }, { "epoch": 89.96, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.197691410779953, "eval_runtime": 1.9978, "eval_samples_per_second": 79.586, "eval_steps_per_second": 5.005, "step": 1012 }, { "epoch": 90.93, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.19791610538959503, "eval_runtime": 2.0853, "eval_samples_per_second": 76.248, "eval_steps_per_second": 4.795, "step": 1023 }, { "epoch": 92.0, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.20358721911907196, "eval_runtime": 2.1963, "eval_samples_per_second": 72.393, "eval_steps_per_second": 4.553, "step": 1035 }, { "epoch": 92.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.19769711792469025, "eval_runtime": 2.0089, "eval_samples_per_second": 79.146, "eval_steps_per_second": 4.978, "step": 1046 }, { "epoch": 93.33, "grad_norm": 0.6099847555160522, "learning_rate": 2.9356060606060604e-05, "loss": 0.1516, "step": 1050 }, { "epoch": 93.96, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.1974458247423172, "eval_runtime": 2.1182, "eval_samples_per_second": 75.065, "eval_steps_per_second": 4.721, "step": 1057 }, { "epoch": 94.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.1993919163942337, "eval_runtime": 2.0707, "eval_samples_per_second": 76.787, "eval_steps_per_second": 4.829, "step": 1068 }, { "epoch": 96.0, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.1955273449420929, "eval_runtime": 2.0163, "eval_samples_per_second": 78.858, "eval_steps_per_second": 4.96, "step": 1080 }, { "epoch": 96.98, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.19483698904514313, "eval_runtime": 2.0495, "eval_samples_per_second": 77.581, "eval_steps_per_second": 4.879, "step": 1091 }, { "epoch": 97.78, "grad_norm": 1.0578981637954712, "learning_rate": 2.9166666666666666e-05, "loss": 0.1386, "step": 1100 }, { "epoch": 97.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.19463855028152466, "eval_runtime": 2.0625, "eval_samples_per_second": 77.091, "eval_steps_per_second": 4.849, "step": 1102 }, { "epoch": 98.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.19323910772800446, "eval_runtime": 2.0028, "eval_samples_per_second": 79.389, "eval_steps_per_second": 4.993, "step": 1113 }, { "epoch": 100.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.1841806173324585, "eval_runtime": 2.1056, "eval_samples_per_second": 75.512, "eval_steps_per_second": 4.749, "step": 1125 }, { "epoch": 100.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.18839451670646667, "eval_runtime": 1.9858, "eval_samples_per_second": 80.07, "eval_steps_per_second": 5.036, "step": 1136 }, { "epoch": 101.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.1899903267621994, "eval_runtime": 2.2196, "eval_samples_per_second": 71.635, "eval_steps_per_second": 4.505, "step": 1147 }, { "epoch": 102.22, "grad_norm": 0.6229210495948792, "learning_rate": 2.897727272727273e-05, "loss": 0.1279, "step": 1150 }, { "epoch": 102.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.184115469455719, "eval_runtime": 2.0229, "eval_samples_per_second": 78.602, "eval_steps_per_second": 4.944, "step": 1158 }, { "epoch": 104.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.19207227230072021, "eval_runtime": 1.9639, "eval_samples_per_second": 80.962, "eval_steps_per_second": 5.092, "step": 1170 }, { "epoch": 104.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.19926591217517853, "eval_runtime": 2.0509, "eval_samples_per_second": 77.526, "eval_steps_per_second": 4.876, "step": 1181 }, { "epoch": 105.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.19455212354660034, "eval_runtime": 2.0496, "eval_samples_per_second": 77.577, "eval_steps_per_second": 4.879, "step": 1192 }, { "epoch": 106.67, "grad_norm": 1.2741256952285767, "learning_rate": 2.8787878787878788e-05, "loss": 0.1258, "step": 1200 }, { "epoch": 106.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.18963727355003357, "eval_runtime": 2.0026, "eval_samples_per_second": 79.395, "eval_steps_per_second": 4.993, "step": 1203 }, { "epoch": 108.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.1884273737668991, "eval_runtime": 2.0343, "eval_samples_per_second": 78.16, "eval_steps_per_second": 4.916, "step": 1215 }, { "epoch": 108.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.17940251529216766, "eval_runtime": 2.1734, "eval_samples_per_second": 73.156, "eval_steps_per_second": 4.601, "step": 1226 }, { "epoch": 109.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.18589730560779572, "eval_runtime": 2.0874, "eval_samples_per_second": 76.17, "eval_steps_per_second": 4.791, "step": 1237 }, { "epoch": 110.93, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.2194768339395523, "eval_runtime": 2.0717, "eval_samples_per_second": 76.747, "eval_steps_per_second": 4.827, "step": 1248 }, { "epoch": 111.11, "grad_norm": 0.3613344430923462, "learning_rate": 2.859848484848485e-05, "loss": 0.1258, "step": 1250 }, { "epoch": 112.0, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.20826272666454315, "eval_runtime": 1.9861, "eval_samples_per_second": 80.057, "eval_steps_per_second": 5.035, "step": 1260 }, { "epoch": 112.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.21202689409255981, "eval_runtime": 2.0132, "eval_samples_per_second": 78.98, "eval_steps_per_second": 4.967, "step": 1271 }, { "epoch": 113.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.20663346350193024, "eval_runtime": 2.02, "eval_samples_per_second": 78.711, "eval_steps_per_second": 4.95, "step": 1282 }, { "epoch": 114.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.1931203156709671, "eval_runtime": 2.033, "eval_samples_per_second": 78.208, "eval_steps_per_second": 4.919, "step": 1293 }, { "epoch": 115.56, "grad_norm": 0.7503376007080078, "learning_rate": 2.8409090909090912e-05, "loss": 0.1023, "step": 1300 }, { "epoch": 116.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.19000084698200226, "eval_runtime": 2.0014, "eval_samples_per_second": 79.446, "eval_steps_per_second": 4.997, "step": 1305 }, { "epoch": 116.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.20288796722888947, "eval_runtime": 2.0774, "eval_samples_per_second": 76.539, "eval_steps_per_second": 4.814, "step": 1316 }, { "epoch": 117.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.19505923986434937, "eval_runtime": 2.0552, "eval_samples_per_second": 77.366, "eval_steps_per_second": 4.866, "step": 1327 }, { "epoch": 118.93, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.20838169753551483, "eval_runtime": 2.2371, "eval_samples_per_second": 71.074, "eval_steps_per_second": 4.47, "step": 1338 }, { "epoch": 120.0, "grad_norm": 0.2376416176557541, "learning_rate": 2.821969696969697e-05, "loss": 0.0997, "step": 1350 }, { "epoch": 120.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.2159019112586975, "eval_runtime": 2.0579, "eval_samples_per_second": 77.264, "eval_steps_per_second": 4.859, "step": 1350 }, { "epoch": 120.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.21662545204162598, "eval_runtime": 2.0756, "eval_samples_per_second": 76.605, "eval_steps_per_second": 4.818, "step": 1361 }, { "epoch": 121.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.197323277592659, "eval_runtime": 2.0227, "eval_samples_per_second": 78.607, "eval_steps_per_second": 4.944, "step": 1372 }, { "epoch": 122.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.18507684767246246, "eval_runtime": 2.0728, "eval_samples_per_second": 76.706, "eval_steps_per_second": 4.824, "step": 1383 }, { "epoch": 124.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.20666691660881042, "eval_runtime": 1.9717, "eval_samples_per_second": 80.642, "eval_steps_per_second": 5.072, "step": 1395 }, { "epoch": 124.44, "grad_norm": 0.3115290403366089, "learning_rate": 2.803030303030303e-05, "loss": 0.1021, "step": 1400 }, { "epoch": 124.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.19534242153167725, "eval_runtime": 2.0497, "eval_samples_per_second": 77.571, "eval_steps_per_second": 4.879, "step": 1406 }, { "epoch": 125.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.17650572955608368, "eval_runtime": 2.239, "eval_samples_per_second": 71.015, "eval_steps_per_second": 4.466, "step": 1417 }, { "epoch": 126.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.18782062828540802, "eval_runtime": 2.0533, "eval_samples_per_second": 77.437, "eval_steps_per_second": 4.87, "step": 1428 }, { "epoch": 128.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.20708344876766205, "eval_runtime": 2.0414, "eval_samples_per_second": 77.887, "eval_steps_per_second": 4.899, "step": 1440 }, { "epoch": 128.89, "grad_norm": 1.2413551807403564, "learning_rate": 2.784090909090909e-05, "loss": 0.0883, "step": 1450 }, { "epoch": 128.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.2241077572107315, "eval_runtime": 1.9826, "eval_samples_per_second": 80.197, "eval_steps_per_second": 5.044, "step": 1451 }, { "epoch": 129.96, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.23481474816799164, "eval_runtime": 1.9747, "eval_samples_per_second": 80.518, "eval_steps_per_second": 5.064, "step": 1462 }, { "epoch": 130.93, "eval_accuracy": 0.9056603773584906, "eval_loss": 0.24748335778713226, "eval_runtime": 1.9737, "eval_samples_per_second": 80.559, "eval_steps_per_second": 5.067, "step": 1473 }, { "epoch": 132.0, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.21596243977546692, "eval_runtime": 2.0455, "eval_samples_per_second": 77.733, "eval_steps_per_second": 4.889, "step": 1485 }, { "epoch": 132.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.20896825194358826, "eval_runtime": 2.047, "eval_samples_per_second": 77.675, "eval_steps_per_second": 4.885, "step": 1496 }, { "epoch": 133.33, "grad_norm": 0.56540846824646, "learning_rate": 2.7651515151515152e-05, "loss": 0.0769, "step": 1500 }, { "epoch": 133.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.21468934416770935, "eval_runtime": 1.9936, "eval_samples_per_second": 79.754, "eval_steps_per_second": 5.016, "step": 1507 }, { "epoch": 134.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.22008037567138672, "eval_runtime": 2.0857, "eval_samples_per_second": 76.234, "eval_steps_per_second": 4.795, "step": 1518 }, { "epoch": 136.0, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.23723578453063965, "eval_runtime": 2.1872, "eval_samples_per_second": 72.695, "eval_steps_per_second": 4.572, "step": 1530 }, { "epoch": 136.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.21990692615509033, "eval_runtime": 2.0473, "eval_samples_per_second": 77.664, "eval_steps_per_second": 4.885, "step": 1541 }, { "epoch": 137.78, "grad_norm": 1.0245180130004883, "learning_rate": 2.7462121212121214e-05, "loss": 0.0786, "step": 1550 }, { "epoch": 137.96, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.2087443619966507, "eval_runtime": 2.0577, "eval_samples_per_second": 77.271, "eval_steps_per_second": 4.86, "step": 1552 }, { "epoch": 138.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.18779344856739044, "eval_runtime": 2.0799, "eval_samples_per_second": 76.447, "eval_steps_per_second": 4.808, "step": 1563 }, { "epoch": 140.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.1914655864238739, "eval_runtime": 2.043, "eval_samples_per_second": 77.827, "eval_steps_per_second": 4.895, "step": 1575 }, { "epoch": 140.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.23168283700942993, "eval_runtime": 2.0313, "eval_samples_per_second": 78.277, "eval_steps_per_second": 4.923, "step": 1586 }, { "epoch": 141.96, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.2865447700023651, "eval_runtime": 2.0095, "eval_samples_per_second": 79.125, "eval_steps_per_second": 4.976, "step": 1597 }, { "epoch": 142.22, "grad_norm": 1.393044352531433, "learning_rate": 2.7272727272727273e-05, "loss": 0.0714, "step": 1600 }, { "epoch": 142.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.22998519241809845, "eval_runtime": 2.1842, "eval_samples_per_second": 72.794, "eval_steps_per_second": 4.578, "step": 1608 }, { "epoch": 144.0, "eval_accuracy": 0.9056603773584906, "eval_loss": 0.27265357971191406, "eval_runtime": 2.0318, "eval_samples_per_second": 78.258, "eval_steps_per_second": 4.922, "step": 1620 }, { "epoch": 144.98, "eval_accuracy": 0.9056603773584906, "eval_loss": 0.28114742040634155, "eval_runtime": 2.0949, "eval_samples_per_second": 75.9, "eval_steps_per_second": 4.774, "step": 1631 }, { "epoch": 145.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.21014899015426636, "eval_runtime": 2.0829, "eval_samples_per_second": 76.335, "eval_steps_per_second": 4.801, "step": 1642 }, { "epoch": 146.67, "grad_norm": 1.1527929306030273, "learning_rate": 2.7083333333333335e-05, "loss": 0.0702, "step": 1650 }, { "epoch": 146.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.20363318920135498, "eval_runtime": 2.0224, "eval_samples_per_second": 78.618, "eval_steps_per_second": 4.945, "step": 1653 }, { "epoch": 148.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.22154641151428223, "eval_runtime": 2.0286, "eval_samples_per_second": 78.378, "eval_steps_per_second": 4.929, "step": 1665 }, { "epoch": 148.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.21356013417243958, "eval_runtime": 1.9745, "eval_samples_per_second": 80.526, "eval_steps_per_second": 5.065, "step": 1676 }, { "epoch": 149.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.20560431480407715, "eval_runtime": 2.0343, "eval_samples_per_second": 78.161, "eval_steps_per_second": 4.916, "step": 1687 }, { "epoch": 150.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.20028233528137207, "eval_runtime": 2.0476, "eval_samples_per_second": 77.65, "eval_steps_per_second": 4.884, "step": 1698 }, { "epoch": 151.11, "grad_norm": 0.6037131547927856, "learning_rate": 2.6893939393939398e-05, "loss": 0.0676, "step": 1700 }, { "epoch": 152.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.22495229542255402, "eval_runtime": 2.0653, "eval_samples_per_second": 76.985, "eval_steps_per_second": 4.842, "step": 1710 }, { "epoch": 152.98, "eval_accuracy": 0.9559748427672956, "eval_loss": 0.1910940259695053, "eval_runtime": 2.2097, "eval_samples_per_second": 71.955, "eval_steps_per_second": 4.525, "step": 1721 }, { "epoch": 153.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.2189728170633316, "eval_runtime": 2.049, "eval_samples_per_second": 77.598, "eval_steps_per_second": 4.88, "step": 1732 }, { "epoch": 154.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.1975589245557785, "eval_runtime": 2.0536, "eval_samples_per_second": 77.426, "eval_steps_per_second": 4.87, "step": 1743 }, { "epoch": 155.56, "grad_norm": 0.9841188788414001, "learning_rate": 2.6704545454545453e-05, "loss": 0.0674, "step": 1750 }, { "epoch": 156.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.18743836879730225, "eval_runtime": 2.0593, "eval_samples_per_second": 77.211, "eval_steps_per_second": 4.856, "step": 1755 }, { "epoch": 156.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2022770792245865, "eval_runtime": 2.0432, "eval_samples_per_second": 77.821, "eval_steps_per_second": 4.894, "step": 1766 }, { "epoch": 157.96, "eval_accuracy": 0.949685534591195, "eval_loss": 0.21527531743049622, "eval_runtime": 1.9951, "eval_samples_per_second": 79.694, "eval_steps_per_second": 5.012, "step": 1777 }, { "epoch": 158.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.22451625764369965, "eval_runtime": 2.1442, "eval_samples_per_second": 74.155, "eval_steps_per_second": 4.664, "step": 1788 }, { "epoch": 160.0, "grad_norm": 0.5377254486083984, "learning_rate": 2.6515151515151516e-05, "loss": 0.0548, "step": 1800 }, { "epoch": 160.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.2431740015745163, "eval_runtime": 2.2699, "eval_samples_per_second": 70.046, "eval_steps_per_second": 4.405, "step": 1800 }, { "epoch": 160.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2071038782596588, "eval_runtime": 2.0506, "eval_samples_per_second": 77.538, "eval_steps_per_second": 4.877, "step": 1811 }, { "epoch": 161.96, "eval_accuracy": 0.949685534591195, "eval_loss": 0.18368059396743774, "eval_runtime": 2.2081, "eval_samples_per_second": 72.006, "eval_steps_per_second": 4.529, "step": 1822 }, { "epoch": 162.93, "eval_accuracy": 0.949685534591195, "eval_loss": 0.19161438941955566, "eval_runtime": 1.9999, "eval_samples_per_second": 79.505, "eval_steps_per_second": 5.0, "step": 1833 }, { "epoch": 164.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.22212089598178864, "eval_runtime": 2.0001, "eval_samples_per_second": 79.497, "eval_steps_per_second": 5.0, "step": 1845 }, { "epoch": 164.44, "grad_norm": 0.5433365702629089, "learning_rate": 2.6325757575757575e-05, "loss": 0.0616, "step": 1850 }, { "epoch": 164.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.21204246580600739, "eval_runtime": 2.035, "eval_samples_per_second": 78.132, "eval_steps_per_second": 4.914, "step": 1856 }, { "epoch": 165.96, "eval_accuracy": 0.9559748427672956, "eval_loss": 0.18882697820663452, "eval_runtime": 2.0581, "eval_samples_per_second": 77.256, "eval_steps_per_second": 4.859, "step": 1867 }, { "epoch": 166.93, "eval_accuracy": 0.949685534591195, "eval_loss": 0.19714578986167908, "eval_runtime": 2.002, "eval_samples_per_second": 79.422, "eval_steps_per_second": 4.995, "step": 1878 }, { "epoch": 168.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.21613995730876923, "eval_runtime": 2.0979, "eval_samples_per_second": 75.789, "eval_steps_per_second": 4.767, "step": 1890 }, { "epoch": 168.89, "grad_norm": 0.4616011083126068, "learning_rate": 2.6136363636363637e-05, "loss": 0.0467, "step": 1900 }, { "epoch": 168.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.22824302315711975, "eval_runtime": 2.0023, "eval_samples_per_second": 79.407, "eval_steps_per_second": 4.994, "step": 1901 }, { "epoch": 169.96, "eval_accuracy": 0.9056603773584906, "eval_loss": 0.31181007623672485, "eval_runtime": 2.2272, "eval_samples_per_second": 71.39, "eval_steps_per_second": 4.49, "step": 1912 }, { "epoch": 170.93, "eval_accuracy": 0.949685534591195, "eval_loss": 0.23191651701927185, "eval_runtime": 2.0759, "eval_samples_per_second": 76.592, "eval_steps_per_second": 4.817, "step": 1923 }, { "epoch": 172.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.27404358983039856, "eval_runtime": 2.0769, "eval_samples_per_second": 76.555, "eval_steps_per_second": 4.815, "step": 1935 }, { "epoch": 172.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2666384279727936, "eval_runtime": 2.1046, "eval_samples_per_second": 75.548, "eval_steps_per_second": 4.751, "step": 1946 }, { "epoch": 173.33, "grad_norm": 1.0961925983428955, "learning_rate": 2.59469696969697e-05, "loss": 0.0609, "step": 1950 }, { "epoch": 173.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.23152852058410645, "eval_runtime": 2.0323, "eval_samples_per_second": 78.237, "eval_steps_per_second": 4.921, "step": 1957 }, { "epoch": 174.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.22292692959308624, "eval_runtime": 2.0749, "eval_samples_per_second": 76.629, "eval_steps_per_second": 4.819, "step": 1968 }, { "epoch": 176.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.21578945219516754, "eval_runtime": 2.0472, "eval_samples_per_second": 77.668, "eval_steps_per_second": 4.885, "step": 1980 }, { "epoch": 176.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.22257991135120392, "eval_runtime": 2.1698, "eval_samples_per_second": 73.278, "eval_steps_per_second": 4.609, "step": 1991 }, { "epoch": 177.78, "grad_norm": 1.6022953987121582, "learning_rate": 2.575757575757576e-05, "loss": 0.0522, "step": 2000 }, { "epoch": 177.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.22241446375846863, "eval_runtime": 2.0341, "eval_samples_per_second": 78.167, "eval_steps_per_second": 4.916, "step": 2002 }, { "epoch": 178.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.21375904977321625, "eval_runtime": 2.1094, "eval_samples_per_second": 75.377, "eval_steps_per_second": 4.741, "step": 2013 }, { "epoch": 180.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.21769364178180695, "eval_runtime": 1.9898, "eval_samples_per_second": 79.909, "eval_steps_per_second": 5.026, "step": 2025 }, { "epoch": 180.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.19169649481773376, "eval_runtime": 2.1326, "eval_samples_per_second": 74.558, "eval_steps_per_second": 4.689, "step": 2036 }, { "epoch": 181.96, "eval_accuracy": 0.9559748427672956, "eval_loss": 0.19741381704807281, "eval_runtime": 2.1931, "eval_samples_per_second": 72.5, "eval_steps_per_second": 4.56, "step": 2047 }, { "epoch": 182.22, "grad_norm": 0.7399430274963379, "learning_rate": 2.556818181818182e-05, "loss": 0.0515, "step": 2050 }, { "epoch": 182.93, "eval_accuracy": 0.949685534591195, "eval_loss": 0.21981187164783478, "eval_runtime": 2.0417, "eval_samples_per_second": 77.878, "eval_steps_per_second": 4.898, "step": 2058 }, { "epoch": 184.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.24247391521930695, "eval_runtime": 2.1999, "eval_samples_per_second": 72.278, "eval_steps_per_second": 4.546, "step": 2070 }, { "epoch": 184.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.24488882720470428, "eval_runtime": 2.0767, "eval_samples_per_second": 76.565, "eval_steps_per_second": 4.815, "step": 2081 }, { "epoch": 185.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.23463451862335205, "eval_runtime": 2.0674, "eval_samples_per_second": 76.907, "eval_steps_per_second": 4.837, "step": 2092 }, { "epoch": 186.67, "grad_norm": 0.67291659116745, "learning_rate": 2.5378787878787876e-05, "loss": 0.045, "step": 2100 }, { "epoch": 186.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.23308323323726654, "eval_runtime": 2.2603, "eval_samples_per_second": 70.346, "eval_steps_per_second": 4.424, "step": 2103 }, { "epoch": 188.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2660614252090454, "eval_runtime": 2.0509, "eval_samples_per_second": 77.527, "eval_steps_per_second": 4.876, "step": 2115 }, { "epoch": 188.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.22910529375076294, "eval_runtime": 2.0536, "eval_samples_per_second": 77.423, "eval_steps_per_second": 4.869, "step": 2126 }, { "epoch": 189.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.23477251827716827, "eval_runtime": 2.0092, "eval_samples_per_second": 79.134, "eval_steps_per_second": 4.977, "step": 2137 }, { "epoch": 190.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.23087622225284576, "eval_runtime": 2.0403, "eval_samples_per_second": 77.929, "eval_steps_per_second": 4.901, "step": 2148 }, { "epoch": 191.11, "grad_norm": 0.11660194396972656, "learning_rate": 2.518939393939394e-05, "loss": 0.0403, "step": 2150 }, { "epoch": 192.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.27889564633369446, "eval_runtime": 2.0147, "eval_samples_per_second": 78.921, "eval_steps_per_second": 4.964, "step": 2160 }, { "epoch": 192.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2540048658847809, "eval_runtime": 2.1082, "eval_samples_per_second": 75.42, "eval_steps_per_second": 4.743, "step": 2171 }, { "epoch": 193.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.23720349371433258, "eval_runtime": 2.1791, "eval_samples_per_second": 72.966, "eval_steps_per_second": 4.589, "step": 2182 }, { "epoch": 194.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2507873773574829, "eval_runtime": 1.986, "eval_samples_per_second": 80.061, "eval_steps_per_second": 5.035, "step": 2193 }, { "epoch": 195.56, "grad_norm": 0.8518453240394592, "learning_rate": 2.5e-05, "loss": 0.0476, "step": 2200 }, { "epoch": 196.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.2193620353937149, "eval_runtime": 2.1819, "eval_samples_per_second": 72.874, "eval_steps_per_second": 4.583, "step": 2205 }, { "epoch": 196.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.23066306114196777, "eval_runtime": 2.0482, "eval_samples_per_second": 77.628, "eval_steps_per_second": 4.882, "step": 2216 }, { "epoch": 197.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2719472646713257, "eval_runtime": 1.9901, "eval_samples_per_second": 79.896, "eval_steps_per_second": 5.025, "step": 2227 }, { "epoch": 198.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.28040099143981934, "eval_runtime": 2.0617, "eval_samples_per_second": 77.122, "eval_steps_per_second": 4.85, "step": 2238 }, { "epoch": 200.0, "grad_norm": 0.09039253741502762, "learning_rate": 2.481060606060606e-05, "loss": 0.0457, "step": 2250 }, { "epoch": 200.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2755438983440399, "eval_runtime": 2.0773, "eval_samples_per_second": 76.541, "eval_steps_per_second": 4.814, "step": 2250 }, { "epoch": 200.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2353052794933319, "eval_runtime": 1.9899, "eval_samples_per_second": 79.904, "eval_steps_per_second": 5.025, "step": 2261 }, { "epoch": 201.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.21893078088760376, "eval_runtime": 2.1045, "eval_samples_per_second": 75.552, "eval_steps_per_second": 4.752, "step": 2272 }, { "epoch": 202.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.21625204384326935, "eval_runtime": 2.0731, "eval_samples_per_second": 76.697, "eval_steps_per_second": 4.824, "step": 2283 }, { "epoch": 204.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.2110479772090912, "eval_runtime": 2.1463, "eval_samples_per_second": 74.079, "eval_steps_per_second": 4.659, "step": 2295 }, { "epoch": 204.44, "grad_norm": 0.9943685531616211, "learning_rate": 2.4621212121212123e-05, "loss": 0.0393, "step": 2300 }, { "epoch": 204.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.23164410889148712, "eval_runtime": 2.0606, "eval_samples_per_second": 77.162, "eval_steps_per_second": 4.853, "step": 2306 }, { "epoch": 205.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.24650876224040985, "eval_runtime": 2.0011, "eval_samples_per_second": 79.455, "eval_steps_per_second": 4.997, "step": 2317 }, { "epoch": 206.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.23763243854045868, "eval_runtime": 2.0999, "eval_samples_per_second": 75.719, "eval_steps_per_second": 4.762, "step": 2328 }, { "epoch": 208.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2170635461807251, "eval_runtime": 2.1575, "eval_samples_per_second": 73.697, "eval_steps_per_second": 4.635, "step": 2340 }, { "epoch": 208.89, "grad_norm": 0.46173095703125, "learning_rate": 2.4431818181818185e-05, "loss": 0.0443, "step": 2350 }, { "epoch": 208.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.23952844738960266, "eval_runtime": 2.0014, "eval_samples_per_second": 79.445, "eval_steps_per_second": 4.997, "step": 2351 }, { "epoch": 209.96, "eval_accuracy": 0.8930817610062893, "eval_loss": 0.2906019687652588, "eval_runtime": 2.0133, "eval_samples_per_second": 78.977, "eval_steps_per_second": 4.967, "step": 2362 }, { "epoch": 210.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2608316242694855, "eval_runtime": 2.1558, "eval_samples_per_second": 73.755, "eval_steps_per_second": 4.639, "step": 2373 }, { "epoch": 212.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.23210321366786957, "eval_runtime": 2.0606, "eval_samples_per_second": 77.161, "eval_steps_per_second": 4.853, "step": 2385 }, { "epoch": 212.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.24640053510665894, "eval_runtime": 2.2148, "eval_samples_per_second": 71.79, "eval_steps_per_second": 4.515, "step": 2396 }, { "epoch": 213.33, "grad_norm": 0.94215327501297, "learning_rate": 2.4242424242424244e-05, "loss": 0.0539, "step": 2400 }, { "epoch": 213.96, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.2441636025905609, "eval_runtime": 2.172, "eval_samples_per_second": 73.203, "eval_steps_per_second": 4.604, "step": 2407 }, { "epoch": 214.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.2511676847934723, "eval_runtime": 2.0176, "eval_samples_per_second": 78.806, "eval_steps_per_second": 4.956, "step": 2418 }, { "epoch": 216.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.22649481892585754, "eval_runtime": 2.0103, "eval_samples_per_second": 79.091, "eval_steps_per_second": 4.974, "step": 2430 }, { "epoch": 216.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.21274729073047638, "eval_runtime": 2.0508, "eval_samples_per_second": 77.529, "eval_steps_per_second": 4.876, "step": 2441 }, { "epoch": 217.78, "grad_norm": 0.7381362318992615, "learning_rate": 2.4053030303030303e-05, "loss": 0.0415, "step": 2450 }, { "epoch": 217.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.284365177154541, "eval_runtime": 2.0321, "eval_samples_per_second": 78.244, "eval_steps_per_second": 4.921, "step": 2452 }, { "epoch": 218.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.24891048669815063, "eval_runtime": 2.0843, "eval_samples_per_second": 76.285, "eval_steps_per_second": 4.798, "step": 2463 }, { "epoch": 220.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.21200108528137207, "eval_runtime": 1.9938, "eval_samples_per_second": 79.748, "eval_steps_per_second": 5.016, "step": 2475 }, { "epoch": 220.98, "eval_accuracy": 0.9559748427672956, "eval_loss": 0.2015109807252884, "eval_runtime": 2.2098, "eval_samples_per_second": 71.951, "eval_steps_per_second": 4.525, "step": 2486 }, { "epoch": 221.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.25095799565315247, "eval_runtime": 2.0817, "eval_samples_per_second": 76.381, "eval_steps_per_second": 4.804, "step": 2497 }, { "epoch": 222.22, "grad_norm": 0.3756774961948395, "learning_rate": 2.3863636363636362e-05, "loss": 0.0325, "step": 2500 }, { "epoch": 222.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2875436246395111, "eval_runtime": 2.0148, "eval_samples_per_second": 78.915, "eval_steps_per_second": 4.963, "step": 2508 }, { "epoch": 224.0, "eval_accuracy": 0.9622641509433962, "eval_loss": 0.19936275482177734, "eval_runtime": 2.0208, "eval_samples_per_second": 78.682, "eval_steps_per_second": 4.949, "step": 2520 }, { "epoch": 224.98, "eval_accuracy": 0.9622641509433962, "eval_loss": 0.20330873131752014, "eval_runtime": 2.1708, "eval_samples_per_second": 73.243, "eval_steps_per_second": 4.606, "step": 2531 }, { "epoch": 225.96, "eval_accuracy": 0.949685534591195, "eval_loss": 0.2391451746225357, "eval_runtime": 1.9988, "eval_samples_per_second": 79.549, "eval_steps_per_second": 5.003, "step": 2542 }, { "epoch": 226.67, "grad_norm": 0.6930297017097473, "learning_rate": 2.3674242424242424e-05, "loss": 0.0249, "step": 2550 }, { "epoch": 226.93, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.30440014600753784, "eval_runtime": 2.0166, "eval_samples_per_second": 78.847, "eval_steps_per_second": 4.959, "step": 2553 }, { "epoch": 228.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2825218439102173, "eval_runtime": 2.2235, "eval_samples_per_second": 71.51, "eval_steps_per_second": 4.497, "step": 2565 }, { "epoch": 228.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.234725683927536, "eval_runtime": 2.0151, "eval_samples_per_second": 78.905, "eval_steps_per_second": 4.963, "step": 2576 }, { "epoch": 229.96, "eval_accuracy": 0.949685534591195, "eval_loss": 0.24049904942512512, "eval_runtime": 2.0305, "eval_samples_per_second": 78.304, "eval_steps_per_second": 4.925, "step": 2587 }, { "epoch": 230.93, "eval_accuracy": 0.949685534591195, "eval_loss": 0.25367188453674316, "eval_runtime": 2.1765, "eval_samples_per_second": 73.054, "eval_steps_per_second": 4.595, "step": 2598 }, { "epoch": 231.11, "grad_norm": 0.8203662037849426, "learning_rate": 2.3484848484848487e-05, "loss": 0.0358, "step": 2600 }, { "epoch": 232.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.27088040113449097, "eval_runtime": 2.0677, "eval_samples_per_second": 76.895, "eval_steps_per_second": 4.836, "step": 2610 }, { "epoch": 232.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.2444712519645691, "eval_runtime": 2.123, "eval_samples_per_second": 74.893, "eval_steps_per_second": 4.71, "step": 2621 }, { "epoch": 233.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.24358882009983063, "eval_runtime": 2.0612, "eval_samples_per_second": 77.139, "eval_steps_per_second": 4.852, "step": 2632 }, { "epoch": 234.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.22266939282417297, "eval_runtime": 2.0145, "eval_samples_per_second": 78.929, "eval_steps_per_second": 4.964, "step": 2643 }, { "epoch": 235.56, "grad_norm": 0.7004448771476746, "learning_rate": 2.3295454545454546e-05, "loss": 0.0345, "step": 2650 }, { "epoch": 236.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.22081993520259857, "eval_runtime": 2.0852, "eval_samples_per_second": 76.252, "eval_steps_per_second": 4.796, "step": 2655 }, { "epoch": 236.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.22930140793323517, "eval_runtime": 2.038, "eval_samples_per_second": 78.019, "eval_steps_per_second": 4.907, "step": 2666 }, { "epoch": 237.96, "eval_accuracy": 0.949685534591195, "eval_loss": 0.2159855216741562, "eval_runtime": 2.2011, "eval_samples_per_second": 72.236, "eval_steps_per_second": 4.543, "step": 2677 }, { "epoch": 238.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2085605412721634, "eval_runtime": 2.0845, "eval_samples_per_second": 76.277, "eval_steps_per_second": 4.797, "step": 2688 }, { "epoch": 240.0, "grad_norm": 1.642115592956543, "learning_rate": 2.3106060606060608e-05, "loss": 0.0339, "step": 2700 }, { "epoch": 240.0, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.26398828625679016, "eval_runtime": 1.9895, "eval_samples_per_second": 79.918, "eval_steps_per_second": 5.026, "step": 2700 }, { "epoch": 240.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2953893542289734, "eval_runtime": 2.0677, "eval_samples_per_second": 76.899, "eval_steps_per_second": 4.836, "step": 2711 }, { "epoch": 241.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2507174611091614, "eval_runtime": 2.1213, "eval_samples_per_second": 74.953, "eval_steps_per_second": 4.714, "step": 2722 }, { "epoch": 242.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.227327361702919, "eval_runtime": 1.9774, "eval_samples_per_second": 80.407, "eval_steps_per_second": 5.057, "step": 2733 }, { "epoch": 244.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.24215646088123322, "eval_runtime": 2.0297, "eval_samples_per_second": 78.336, "eval_steps_per_second": 4.927, "step": 2745 }, { "epoch": 244.44, "grad_norm": 1.2598336935043335, "learning_rate": 2.2916666666666667e-05, "loss": 0.0309, "step": 2750 }, { "epoch": 244.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2931080758571625, "eval_runtime": 2.1459, "eval_samples_per_second": 74.093, "eval_steps_per_second": 4.66, "step": 2756 }, { "epoch": 245.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2694746255874634, "eval_runtime": 2.0392, "eval_samples_per_second": 77.97, "eval_steps_per_second": 4.904, "step": 2767 }, { "epoch": 246.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.26456066966056824, "eval_runtime": 2.1011, "eval_samples_per_second": 75.673, "eval_steps_per_second": 4.759, "step": 2778 }, { "epoch": 248.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.23147591948509216, "eval_runtime": 2.0349, "eval_samples_per_second": 78.135, "eval_steps_per_second": 4.914, "step": 2790 }, { "epoch": 248.89, "grad_norm": 1.3385041952133179, "learning_rate": 2.272727272727273e-05, "loss": 0.0301, "step": 2800 }, { "epoch": 248.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2269720882177353, "eval_runtime": 2.0267, "eval_samples_per_second": 78.453, "eval_steps_per_second": 4.934, "step": 2801 }, { "epoch": 249.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.244718536734581, "eval_runtime": 2.0507, "eval_samples_per_second": 77.533, "eval_steps_per_second": 4.876, "step": 2812 }, { "epoch": 250.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2586061358451843, "eval_runtime": 2.0836, "eval_samples_per_second": 76.312, "eval_steps_per_second": 4.799, "step": 2823 }, { "epoch": 252.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3038959503173828, "eval_runtime": 2.0093, "eval_samples_per_second": 79.132, "eval_steps_per_second": 4.977, "step": 2835 }, { "epoch": 252.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.27771249413490295, "eval_runtime": 2.1305, "eval_samples_per_second": 74.63, "eval_steps_per_second": 4.694, "step": 2846 }, { "epoch": 253.33, "grad_norm": 0.40545353293418884, "learning_rate": 2.2537878787878788e-05, "loss": 0.0335, "step": 2850 }, { "epoch": 253.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.256588876247406, "eval_runtime": 2.1001, "eval_samples_per_second": 75.709, "eval_steps_per_second": 4.762, "step": 2857 }, { "epoch": 254.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.26031869649887085, "eval_runtime": 2.2094, "eval_samples_per_second": 71.966, "eval_steps_per_second": 4.526, "step": 2868 }, { "epoch": 256.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.26985806226730347, "eval_runtime": 1.9916, "eval_samples_per_second": 79.835, "eval_steps_per_second": 5.021, "step": 2880 }, { "epoch": 256.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2838137149810791, "eval_runtime": 1.9992, "eval_samples_per_second": 79.532, "eval_steps_per_second": 5.002, "step": 2891 }, { "epoch": 257.78, "grad_norm": 0.2661449611186981, "learning_rate": 2.2348484848484847e-05, "loss": 0.0249, "step": 2900 }, { "epoch": 257.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2572626769542694, "eval_runtime": 2.0448, "eval_samples_per_second": 77.758, "eval_steps_per_second": 4.89, "step": 2902 }, { "epoch": 258.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2652382254600525, "eval_runtime": 2.0483, "eval_samples_per_second": 77.627, "eval_steps_per_second": 4.882, "step": 2913 }, { "epoch": 260.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.26221606135368347, "eval_runtime": 1.9761, "eval_samples_per_second": 80.461, "eval_steps_per_second": 5.06, "step": 2925 }, { "epoch": 260.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2583387494087219, "eval_runtime": 2.0285, "eval_samples_per_second": 78.384, "eval_steps_per_second": 4.93, "step": 2936 }, { "epoch": 261.96, "eval_accuracy": 0.949685534591195, "eval_loss": 0.23241400718688965, "eval_runtime": 2.1753, "eval_samples_per_second": 73.094, "eval_steps_per_second": 4.597, "step": 2947 }, { "epoch": 262.22, "grad_norm": 0.5177292227745056, "learning_rate": 2.215909090909091e-05, "loss": 0.0308, "step": 2950 }, { "epoch": 262.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2781696319580078, "eval_runtime": 2.0731, "eval_samples_per_second": 76.695, "eval_steps_per_second": 4.824, "step": 2958 }, { "epoch": 264.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2519301474094391, "eval_runtime": 2.1326, "eval_samples_per_second": 74.556, "eval_steps_per_second": 4.689, "step": 2970 }, { "epoch": 264.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2634475529193878, "eval_runtime": 2.0868, "eval_samples_per_second": 76.194, "eval_steps_per_second": 4.792, "step": 2981 }, { "epoch": 265.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2647358775138855, "eval_runtime": 2.023, "eval_samples_per_second": 78.596, "eval_steps_per_second": 4.943, "step": 2992 }, { "epoch": 266.67, "grad_norm": 0.311382532119751, "learning_rate": 2.1969696969696972e-05, "loss": 0.0282, "step": 3000 }, { "epoch": 266.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.25880536437034607, "eval_runtime": 2.0166, "eval_samples_per_second": 78.845, "eval_steps_per_second": 4.959, "step": 3003 }, { "epoch": 268.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.23151087760925293, "eval_runtime": 2.1955, "eval_samples_per_second": 72.42, "eval_steps_per_second": 4.555, "step": 3015 }, { "epoch": 268.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.22928977012634277, "eval_runtime": 2.1352, "eval_samples_per_second": 74.465, "eval_steps_per_second": 4.683, "step": 3026 }, { "epoch": 269.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.23751527070999146, "eval_runtime": 2.0031, "eval_samples_per_second": 79.378, "eval_steps_per_second": 4.992, "step": 3037 }, { "epoch": 270.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.24385805428028107, "eval_runtime": 2.109, "eval_samples_per_second": 75.392, "eval_steps_per_second": 4.742, "step": 3048 }, { "epoch": 271.11, "grad_norm": 0.8252888321876526, "learning_rate": 2.178030303030303e-05, "loss": 0.0347, "step": 3050 }, { "epoch": 272.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2541854679584503, "eval_runtime": 2.1279, "eval_samples_per_second": 74.722, "eval_steps_per_second": 4.699, "step": 3060 }, { "epoch": 272.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.24015812575817108, "eval_runtime": 1.9697, "eval_samples_per_second": 80.724, "eval_steps_per_second": 5.077, "step": 3071 }, { "epoch": 273.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2365039885044098, "eval_runtime": 2.1369, "eval_samples_per_second": 74.406, "eval_steps_per_second": 4.68, "step": 3082 }, { "epoch": 274.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2757132053375244, "eval_runtime": 2.0094, "eval_samples_per_second": 79.128, "eval_steps_per_second": 4.977, "step": 3093 }, { "epoch": 275.56, "grad_norm": 0.06441498547792435, "learning_rate": 2.1590909090909093e-05, "loss": 0.0211, "step": 3100 }, { "epoch": 276.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.25078749656677246, "eval_runtime": 2.0059, "eval_samples_per_second": 79.266, "eval_steps_per_second": 4.985, "step": 3105 }, { "epoch": 276.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.23951387405395508, "eval_runtime": 2.174, "eval_samples_per_second": 73.137, "eval_steps_per_second": 4.6, "step": 3116 }, { "epoch": 277.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.25363460183143616, "eval_runtime": 2.0281, "eval_samples_per_second": 78.399, "eval_steps_per_second": 4.931, "step": 3127 }, { "epoch": 278.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.26847586035728455, "eval_runtime": 2.2802, "eval_samples_per_second": 69.729, "eval_steps_per_second": 4.385, "step": 3138 }, { "epoch": 280.0, "grad_norm": 0.5554720759391785, "learning_rate": 2.1401515151515152e-05, "loss": 0.0248, "step": 3150 }, { "epoch": 280.0, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.2974900007247925, "eval_runtime": 2.0423, "eval_samples_per_second": 77.852, "eval_steps_per_second": 4.896, "step": 3150 }, { "epoch": 280.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.3234010636806488, "eval_runtime": 2.0793, "eval_samples_per_second": 76.469, "eval_steps_per_second": 4.809, "step": 3161 }, { "epoch": 281.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2707124352455139, "eval_runtime": 2.0919, "eval_samples_per_second": 76.007, "eval_steps_per_second": 4.78, "step": 3172 }, { "epoch": 282.93, "eval_accuracy": 0.9559748427672956, "eval_loss": 0.22501063346862793, "eval_runtime": 1.9726, "eval_samples_per_second": 80.606, "eval_steps_per_second": 5.07, "step": 3183 }, { "epoch": 284.0, "eval_accuracy": 0.9559748427672956, "eval_loss": 0.23188871145248413, "eval_runtime": 1.9745, "eval_samples_per_second": 80.526, "eval_steps_per_second": 5.065, "step": 3195 }, { "epoch": 284.44, "grad_norm": 0.20468498766422272, "learning_rate": 2.121212121212121e-05, "loss": 0.0243, "step": 3200 }, { "epoch": 284.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.25254714488983154, "eval_runtime": 2.1319, "eval_samples_per_second": 74.582, "eval_steps_per_second": 4.691, "step": 3206 }, { "epoch": 285.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.26610061526298523, "eval_runtime": 2.0326, "eval_samples_per_second": 78.226, "eval_steps_per_second": 4.92, "step": 3217 }, { "epoch": 286.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.28444719314575195, "eval_runtime": 2.0467, "eval_samples_per_second": 77.687, "eval_steps_per_second": 4.886, "step": 3228 }, { "epoch": 288.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2571127116680145, "eval_runtime": 2.1631, "eval_samples_per_second": 73.504, "eval_steps_per_second": 4.623, "step": 3240 }, { "epoch": 288.89, "grad_norm": 1.0598843097686768, "learning_rate": 2.1022727272727274e-05, "loss": 0.0223, "step": 3250 }, { "epoch": 288.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.251703679561615, "eval_runtime": 2.09, "eval_samples_per_second": 76.075, "eval_steps_per_second": 4.785, "step": 3251 }, { "epoch": 289.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2636191248893738, "eval_runtime": 2.0348, "eval_samples_per_second": 78.14, "eval_steps_per_second": 4.914, "step": 3262 }, { "epoch": 290.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.26941102743148804, "eval_runtime": 2.0598, "eval_samples_per_second": 77.193, "eval_steps_per_second": 4.855, "step": 3273 }, { "epoch": 292.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.23060773313045502, "eval_runtime": 2.0528, "eval_samples_per_second": 77.454, "eval_steps_per_second": 4.871, "step": 3285 }, { "epoch": 292.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.23769862949848175, "eval_runtime": 2.0936, "eval_samples_per_second": 75.945, "eval_steps_per_second": 4.776, "step": 3296 }, { "epoch": 293.33, "grad_norm": 0.6022414565086365, "learning_rate": 2.0833333333333333e-05, "loss": 0.0234, "step": 3300 }, { "epoch": 293.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.26981261372566223, "eval_runtime": 2.0959, "eval_samples_per_second": 75.861, "eval_steps_per_second": 4.771, "step": 3307 }, { "epoch": 294.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.28393277525901794, "eval_runtime": 2.0125, "eval_samples_per_second": 79.007, "eval_steps_per_second": 4.969, "step": 3318 }, { "epoch": 296.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.25016099214553833, "eval_runtime": 2.1941, "eval_samples_per_second": 72.467, "eval_steps_per_second": 4.558, "step": 3330 }, { "epoch": 296.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.27042049169540405, "eval_runtime": 2.0192, "eval_samples_per_second": 78.742, "eval_steps_per_second": 4.952, "step": 3341 }, { "epoch": 297.78, "grad_norm": 0.03581221029162407, "learning_rate": 2.0643939393939395e-05, "loss": 0.0256, "step": 3350 }, { "epoch": 297.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.28789857029914856, "eval_runtime": 2.1148, "eval_samples_per_second": 75.183, "eval_steps_per_second": 4.729, "step": 3352 }, { "epoch": 298.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3078269362449646, "eval_runtime": 2.0062, "eval_samples_per_second": 79.253, "eval_steps_per_second": 4.984, "step": 3363 }, { "epoch": 300.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.31602492928504944, "eval_runtime": 2.0641, "eval_samples_per_second": 77.031, "eval_steps_per_second": 4.845, "step": 3375 }, { "epoch": 300.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2705954313278198, "eval_runtime": 2.0316, "eval_samples_per_second": 78.263, "eval_steps_per_second": 4.922, "step": 3386 }, { "epoch": 301.96, "eval_accuracy": 0.949685534591195, "eval_loss": 0.2504004240036011, "eval_runtime": 2.1492, "eval_samples_per_second": 73.982, "eval_steps_per_second": 4.653, "step": 3397 }, { "epoch": 302.22, "grad_norm": 2.553766965866089, "learning_rate": 2.0454545454545454e-05, "loss": 0.0224, "step": 3400 }, { "epoch": 302.93, "eval_accuracy": 0.9559748427672956, "eval_loss": 0.24540336430072784, "eval_runtime": 2.0269, "eval_samples_per_second": 78.443, "eval_steps_per_second": 4.934, "step": 3408 }, { "epoch": 304.0, "eval_accuracy": 0.9559748427672956, "eval_loss": 0.24798454344272614, "eval_runtime": 2.0863, "eval_samples_per_second": 76.213, "eval_steps_per_second": 4.793, "step": 3420 }, { "epoch": 304.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.2511013150215149, "eval_runtime": 2.0476, "eval_samples_per_second": 77.651, "eval_steps_per_second": 4.884, "step": 3431 }, { "epoch": 305.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2796252369880676, "eval_runtime": 2.1539, "eval_samples_per_second": 73.819, "eval_steps_per_second": 4.643, "step": 3442 }, { "epoch": 306.67, "grad_norm": 0.41460466384887695, "learning_rate": 2.0265151515151516e-05, "loss": 0.0155, "step": 3450 }, { "epoch": 306.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.29322367906570435, "eval_runtime": 2.093, "eval_samples_per_second": 75.966, "eval_steps_per_second": 4.778, "step": 3453 }, { "epoch": 308.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.2996874153614044, "eval_runtime": 2.0951, "eval_samples_per_second": 75.893, "eval_steps_per_second": 4.773, "step": 3465 }, { "epoch": 308.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3044210970401764, "eval_runtime": 1.9749, "eval_samples_per_second": 80.512, "eval_steps_per_second": 5.064, "step": 3476 }, { "epoch": 309.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3255678415298462, "eval_runtime": 2.0175, "eval_samples_per_second": 78.81, "eval_steps_per_second": 4.957, "step": 3487 }, { "epoch": 310.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3400976359844208, "eval_runtime": 2.0285, "eval_samples_per_second": 78.381, "eval_steps_per_second": 4.93, "step": 3498 }, { "epoch": 311.11, "grad_norm": 0.5975369811058044, "learning_rate": 2.007575757575758e-05, "loss": 0.0226, "step": 3500 }, { "epoch": 312.0, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.30681127309799194, "eval_runtime": 2.0805, "eval_samples_per_second": 76.424, "eval_steps_per_second": 4.807, "step": 3510 }, { "epoch": 312.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.30169352889060974, "eval_runtime": 2.1998, "eval_samples_per_second": 72.279, "eval_steps_per_second": 4.546, "step": 3521 }, { "epoch": 313.96, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.29409661889076233, "eval_runtime": 2.1625, "eval_samples_per_second": 73.527, "eval_steps_per_second": 4.624, "step": 3532 }, { "epoch": 314.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2840117812156677, "eval_runtime": 2.0614, "eval_samples_per_second": 77.134, "eval_steps_per_second": 4.851, "step": 3543 }, { "epoch": 315.56, "grad_norm": 0.4768455922603607, "learning_rate": 1.9886363636363634e-05, "loss": 0.0153, "step": 3550 }, { "epoch": 316.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.28999558091163635, "eval_runtime": 2.0423, "eval_samples_per_second": 77.855, "eval_steps_per_second": 4.897, "step": 3555 }, { "epoch": 316.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.29232266545295715, "eval_runtime": 2.0108, "eval_samples_per_second": 79.073, "eval_steps_per_second": 4.973, "step": 3566 }, { "epoch": 317.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2964979112148285, "eval_runtime": 1.9633, "eval_samples_per_second": 80.988, "eval_steps_per_second": 5.094, "step": 3577 }, { "epoch": 318.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3201989531517029, "eval_runtime": 2.0683, "eval_samples_per_second": 76.876, "eval_steps_per_second": 4.835, "step": 3588 }, { "epoch": 320.0, "grad_norm": 0.01774447225034237, "learning_rate": 1.9696969696969697e-05, "loss": 0.0183, "step": 3600 }, { "epoch": 320.0, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.33252981305122375, "eval_runtime": 1.9991, "eval_samples_per_second": 79.534, "eval_steps_per_second": 5.002, "step": 3600 }, { "epoch": 320.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.34411394596099854, "eval_runtime": 1.9595, "eval_samples_per_second": 81.143, "eval_steps_per_second": 5.103, "step": 3611 }, { "epoch": 321.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3004206120967865, "eval_runtime": 2.102, "eval_samples_per_second": 75.644, "eval_steps_per_second": 4.757, "step": 3622 }, { "epoch": 322.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3022076487541199, "eval_runtime": 2.1248, "eval_samples_per_second": 74.83, "eval_steps_per_second": 4.706, "step": 3633 }, { "epoch": 324.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.29579004645347595, "eval_runtime": 2.073, "eval_samples_per_second": 76.702, "eval_steps_per_second": 4.824, "step": 3645 }, { "epoch": 324.44, "grad_norm": 0.43064549565315247, "learning_rate": 1.950757575757576e-05, "loss": 0.0257, "step": 3650 }, { "epoch": 324.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2942567765712738, "eval_runtime": 2.08, "eval_samples_per_second": 76.442, "eval_steps_per_second": 4.808, "step": 3656 }, { "epoch": 325.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2944892942905426, "eval_runtime": 1.9313, "eval_samples_per_second": 82.326, "eval_steps_per_second": 5.178, "step": 3667 }, { "epoch": 326.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.29099544882774353, "eval_runtime": 2.085, "eval_samples_per_second": 76.26, "eval_steps_per_second": 4.796, "step": 3678 }, { "epoch": 328.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.2856423258781433, "eval_runtime": 2.1029, "eval_samples_per_second": 75.609, "eval_steps_per_second": 4.755, "step": 3690 }, { "epoch": 328.89, "grad_norm": 0.7020539045333862, "learning_rate": 1.9318181818181818e-05, "loss": 0.0164, "step": 3700 }, { "epoch": 328.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.25798845291137695, "eval_runtime": 2.0372, "eval_samples_per_second": 78.047, "eval_steps_per_second": 4.909, "step": 3701 }, { "epoch": 329.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2566261291503906, "eval_runtime": 2.1479, "eval_samples_per_second": 74.027, "eval_steps_per_second": 4.656, "step": 3712 }, { "epoch": 330.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2538098394870758, "eval_runtime": 2.0665, "eval_samples_per_second": 76.941, "eval_steps_per_second": 4.839, "step": 3723 }, { "epoch": 332.0, "eval_accuracy": 0.949685534591195, "eval_loss": 0.24481499195098877, "eval_runtime": 2.0898, "eval_samples_per_second": 76.084, "eval_steps_per_second": 4.785, "step": 3735 }, { "epoch": 332.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.2543666958808899, "eval_runtime": 2.035, "eval_samples_per_second": 78.134, "eval_steps_per_second": 4.914, "step": 3746 }, { "epoch": 333.33, "grad_norm": 0.9068632125854492, "learning_rate": 1.912878787878788e-05, "loss": 0.0222, "step": 3750 }, { "epoch": 333.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3075094223022461, "eval_runtime": 2.101, "eval_samples_per_second": 75.678, "eval_steps_per_second": 4.76, "step": 3757 }, { "epoch": 334.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.27574771642684937, "eval_runtime": 2.0253, "eval_samples_per_second": 78.507, "eval_steps_per_second": 4.938, "step": 3768 }, { "epoch": 336.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2714598774909973, "eval_runtime": 2.2265, "eval_samples_per_second": 71.412, "eval_steps_per_second": 4.491, "step": 3780 }, { "epoch": 336.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3330034911632538, "eval_runtime": 2.0552, "eval_samples_per_second": 77.365, "eval_steps_per_second": 4.866, "step": 3791 }, { "epoch": 337.78, "grad_norm": 0.03231671825051308, "learning_rate": 1.893939393939394e-05, "loss": 0.0212, "step": 3800 }, { "epoch": 337.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35598525404930115, "eval_runtime": 2.0188, "eval_samples_per_second": 78.762, "eval_steps_per_second": 4.954, "step": 3802 }, { "epoch": 338.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.28320637345314026, "eval_runtime": 2.1352, "eval_samples_per_second": 74.467, "eval_steps_per_second": 4.683, "step": 3813 }, { "epoch": 340.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2855217754840851, "eval_runtime": 2.1886, "eval_samples_per_second": 72.648, "eval_steps_per_second": 4.569, "step": 3825 }, { "epoch": 340.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.30631041526794434, "eval_runtime": 2.0061, "eval_samples_per_second": 79.26, "eval_steps_per_second": 4.985, "step": 3836 }, { "epoch": 341.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.29151424765586853, "eval_runtime": 2.0201, "eval_samples_per_second": 78.71, "eval_steps_per_second": 4.95, "step": 3847 }, { "epoch": 342.22, "grad_norm": 0.07481174916028976, "learning_rate": 1.8750000000000002e-05, "loss": 0.016, "step": 3850 }, { "epoch": 342.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.28358563780784607, "eval_runtime": 1.9309, "eval_samples_per_second": 82.344, "eval_steps_per_second": 5.179, "step": 3858 }, { "epoch": 344.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.28052231669425964, "eval_runtime": 1.9926, "eval_samples_per_second": 79.797, "eval_steps_per_second": 5.019, "step": 3870 }, { "epoch": 344.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.26776131987571716, "eval_runtime": 2.1218, "eval_samples_per_second": 74.936, "eval_steps_per_second": 4.713, "step": 3881 }, { "epoch": 345.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2839824855327606, "eval_runtime": 2.0764, "eval_samples_per_second": 76.575, "eval_steps_per_second": 4.816, "step": 3892 }, { "epoch": 346.67, "grad_norm": 1.4776334762573242, "learning_rate": 1.856060606060606e-05, "loss": 0.0163, "step": 3900 }, { "epoch": 346.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3048093914985657, "eval_runtime": 2.1233, "eval_samples_per_second": 74.885, "eval_steps_per_second": 4.71, "step": 3903 }, { "epoch": 348.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.27605798840522766, "eval_runtime": 1.9601, "eval_samples_per_second": 81.117, "eval_steps_per_second": 5.102, "step": 3915 }, { "epoch": 348.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.30447614192962646, "eval_runtime": 2.0457, "eval_samples_per_second": 77.724, "eval_steps_per_second": 4.888, "step": 3926 }, { "epoch": 349.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.26728910207748413, "eval_runtime": 2.0205, "eval_samples_per_second": 78.692, "eval_steps_per_second": 4.949, "step": 3937 }, { "epoch": 350.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2830033600330353, "eval_runtime": 2.0741, "eval_samples_per_second": 76.66, "eval_steps_per_second": 4.821, "step": 3948 }, { "epoch": 351.11, "grad_norm": 0.30603834986686707, "learning_rate": 1.837121212121212e-05, "loss": 0.0185, "step": 3950 }, { "epoch": 352.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.31495675444602966, "eval_runtime": 2.0088, "eval_samples_per_second": 79.152, "eval_steps_per_second": 4.978, "step": 3960 }, { "epoch": 352.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2967083156108856, "eval_runtime": 2.0921, "eval_samples_per_second": 75.999, "eval_steps_per_second": 4.78, "step": 3971 }, { "epoch": 353.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2917640507221222, "eval_runtime": 2.1439, "eval_samples_per_second": 74.165, "eval_steps_per_second": 4.664, "step": 3982 }, { "epoch": 354.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2848517894744873, "eval_runtime": 2.0244, "eval_samples_per_second": 78.541, "eval_steps_per_second": 4.94, "step": 3993 }, { "epoch": 355.56, "grad_norm": 0.6905023455619812, "learning_rate": 1.8181818181818182e-05, "loss": 0.0189, "step": 4000 }, { "epoch": 356.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.28043246269226074, "eval_runtime": 2.0697, "eval_samples_per_second": 76.823, "eval_steps_per_second": 4.832, "step": 4005 }, { "epoch": 356.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.29090604186058044, "eval_runtime": 2.3048, "eval_samples_per_second": 68.987, "eval_steps_per_second": 4.339, "step": 4016 }, { "epoch": 357.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3029940724372864, "eval_runtime": 2.0213, "eval_samples_per_second": 78.661, "eval_steps_per_second": 4.947, "step": 4027 }, { "epoch": 358.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.316310852766037, "eval_runtime": 2.0126, "eval_samples_per_second": 79.004, "eval_steps_per_second": 4.969, "step": 4038 }, { "epoch": 360.0, "grad_norm": 0.09516480565071106, "learning_rate": 1.799242424242424e-05, "loss": 0.0153, "step": 4050 }, { "epoch": 360.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.32167917490005493, "eval_runtime": 1.9486, "eval_samples_per_second": 81.598, "eval_steps_per_second": 5.132, "step": 4050 }, { "epoch": 360.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3025132715702057, "eval_runtime": 2.0179, "eval_samples_per_second": 78.794, "eval_steps_per_second": 4.956, "step": 4061 }, { "epoch": 361.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.297443687915802, "eval_runtime": 1.9969, "eval_samples_per_second": 79.622, "eval_steps_per_second": 5.008, "step": 4072 }, { "epoch": 362.93, "eval_accuracy": 0.949685534591195, "eval_loss": 0.28664350509643555, "eval_runtime": 2.0131, "eval_samples_per_second": 78.984, "eval_steps_per_second": 4.968, "step": 4083 }, { "epoch": 364.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.32455363869667053, "eval_runtime": 2.1216, "eval_samples_per_second": 74.943, "eval_steps_per_second": 4.713, "step": 4095 }, { "epoch": 364.44, "grad_norm": 0.14960724115371704, "learning_rate": 1.7803030303030303e-05, "loss": 0.0169, "step": 4100 }, { "epoch": 364.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2801210880279541, "eval_runtime": 1.87, "eval_samples_per_second": 85.025, "eval_steps_per_second": 5.347, "step": 4106 }, { "epoch": 365.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.31326618790626526, "eval_runtime": 1.8975, "eval_samples_per_second": 83.793, "eval_steps_per_second": 5.27, "step": 4117 }, { "epoch": 366.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3283620774745941, "eval_runtime": 1.8154, "eval_samples_per_second": 87.585, "eval_steps_per_second": 5.509, "step": 4128 }, { "epoch": 368.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2716998755931854, "eval_runtime": 1.7785, "eval_samples_per_second": 89.401, "eval_steps_per_second": 5.623, "step": 4140 }, { "epoch": 368.89, "grad_norm": 1.529534935951233, "learning_rate": 1.7613636363636366e-05, "loss": 0.0207, "step": 4150 }, { "epoch": 368.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.26920509338378906, "eval_runtime": 1.777, "eval_samples_per_second": 89.477, "eval_steps_per_second": 5.627, "step": 4151 }, { "epoch": 369.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2673673927783966, "eval_runtime": 1.8105, "eval_samples_per_second": 87.819, "eval_steps_per_second": 5.523, "step": 4162 }, { "epoch": 370.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.26433154940605164, "eval_runtime": 1.8098, "eval_samples_per_second": 87.857, "eval_steps_per_second": 5.526, "step": 4173 }, { "epoch": 372.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2969939410686493, "eval_runtime": 1.7874, "eval_samples_per_second": 88.954, "eval_steps_per_second": 5.595, "step": 4185 }, { "epoch": 372.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2687932550907135, "eval_runtime": 1.9292, "eval_samples_per_second": 82.418, "eval_steps_per_second": 5.184, "step": 4196 }, { "epoch": 373.33, "grad_norm": 0.41630563139915466, "learning_rate": 1.7424242424242425e-05, "loss": 0.0213, "step": 4200 }, { "epoch": 373.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2765069603919983, "eval_runtime": 1.9392, "eval_samples_per_second": 81.994, "eval_steps_per_second": 5.157, "step": 4207 }, { "epoch": 374.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.28704383969306946, "eval_runtime": 1.8427, "eval_samples_per_second": 86.287, "eval_steps_per_second": 5.427, "step": 4218 }, { "epoch": 376.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.30059266090393066, "eval_runtime": 1.8146, "eval_samples_per_second": 87.624, "eval_steps_per_second": 5.511, "step": 4230 }, { "epoch": 376.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2943706512451172, "eval_runtime": 1.7941, "eval_samples_per_second": 88.625, "eval_steps_per_second": 5.574, "step": 4241 }, { "epoch": 377.78, "grad_norm": 1.3894481658935547, "learning_rate": 1.7234848484848487e-05, "loss": 0.02, "step": 4250 }, { "epoch": 377.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3019978106021881, "eval_runtime": 1.8046, "eval_samples_per_second": 88.107, "eval_steps_per_second": 5.541, "step": 4252 }, { "epoch": 378.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3074227571487427, "eval_runtime": 1.7835, "eval_samples_per_second": 89.152, "eval_steps_per_second": 5.607, "step": 4263 }, { "epoch": 380.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.29427269101142883, "eval_runtime": 1.8177, "eval_samples_per_second": 87.473, "eval_steps_per_second": 5.501, "step": 4275 }, { "epoch": 380.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.2825266420841217, "eval_runtime": 1.8911, "eval_samples_per_second": 84.077, "eval_steps_per_second": 5.288, "step": 4286 }, { "epoch": 381.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2760971188545227, "eval_runtime": 1.9521, "eval_samples_per_second": 81.451, "eval_steps_per_second": 5.123, "step": 4297 }, { "epoch": 382.22, "grad_norm": 0.021462175995111465, "learning_rate": 1.7045454545454546e-05, "loss": 0.0143, "step": 4300 }, { "epoch": 382.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.29204800724983215, "eval_runtime": 1.9261, "eval_samples_per_second": 82.551, "eval_steps_per_second": 5.192, "step": 4308 }, { "epoch": 384.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.29515865445137024, "eval_runtime": 1.8478, "eval_samples_per_second": 86.046, "eval_steps_per_second": 5.412, "step": 4320 }, { "epoch": 384.98, "eval_accuracy": 0.949685534591195, "eval_loss": 0.3164711594581604, "eval_runtime": 1.7929, "eval_samples_per_second": 88.684, "eval_steps_per_second": 5.578, "step": 4331 }, { "epoch": 385.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2803152799606323, "eval_runtime": 1.8039, "eval_samples_per_second": 88.141, "eval_steps_per_second": 5.543, "step": 4342 }, { "epoch": 386.67, "grad_norm": 0.4159376621246338, "learning_rate": 1.6856060606060605e-05, "loss": 0.0196, "step": 4350 }, { "epoch": 386.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.28756093978881836, "eval_runtime": 1.7845, "eval_samples_per_second": 89.1, "eval_steps_per_second": 5.604, "step": 4353 }, { "epoch": 388.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2759377956390381, "eval_runtime": 1.8441, "eval_samples_per_second": 86.221, "eval_steps_per_second": 5.423, "step": 4365 }, { "epoch": 388.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2701479494571686, "eval_runtime": 1.7826, "eval_samples_per_second": 89.198, "eval_steps_per_second": 5.61, "step": 4376 }, { "epoch": 389.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2951464354991913, "eval_runtime": 1.9039, "eval_samples_per_second": 83.514, "eval_steps_per_second": 5.252, "step": 4387 }, { "epoch": 390.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2950435280799866, "eval_runtime": 1.8731, "eval_samples_per_second": 84.885, "eval_steps_per_second": 5.339, "step": 4398 }, { "epoch": 391.11, "grad_norm": 0.057938866317272186, "learning_rate": 1.6670454545454544e-05, "loss": 0.0234, "step": 4400 }, { "epoch": 392.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.29603102803230286, "eval_runtime": 1.9831, "eval_samples_per_second": 80.176, "eval_steps_per_second": 5.043, "step": 4410 }, { "epoch": 392.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3337320387363434, "eval_runtime": 1.847, "eval_samples_per_second": 86.084, "eval_steps_per_second": 5.414, "step": 4421 }, { "epoch": 393.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.33828112483024597, "eval_runtime": 1.8496, "eval_samples_per_second": 85.964, "eval_steps_per_second": 5.407, "step": 4432 }, { "epoch": 394.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3078320026397705, "eval_runtime": 1.8258, "eval_samples_per_second": 87.084, "eval_steps_per_second": 5.477, "step": 4443 }, { "epoch": 395.56, "grad_norm": 0.39662787318229675, "learning_rate": 1.6481060606060606e-05, "loss": 0.0161, "step": 4450 }, { "epoch": 396.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3138676881790161, "eval_runtime": 1.7627, "eval_samples_per_second": 90.205, "eval_steps_per_second": 5.673, "step": 4455 }, { "epoch": 396.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.31875431537628174, "eval_runtime": 1.7584, "eval_samples_per_second": 90.422, "eval_steps_per_second": 5.687, "step": 4466 }, { "epoch": 397.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3307281732559204, "eval_runtime": 1.7976, "eval_samples_per_second": 88.452, "eval_steps_per_second": 5.563, "step": 4477 }, { "epoch": 398.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.31634414196014404, "eval_runtime": 1.8551, "eval_samples_per_second": 85.711, "eval_steps_per_second": 5.391, "step": 4488 }, { "epoch": 400.0, "grad_norm": 0.7240819931030273, "learning_rate": 1.6291666666666665e-05, "loss": 0.0162, "step": 4500 }, { "epoch": 400.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3018243908882141, "eval_runtime": 1.9085, "eval_samples_per_second": 83.313, "eval_steps_per_second": 5.24, "step": 4500 }, { "epoch": 400.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2813258469104767, "eval_runtime": 2.0304, "eval_samples_per_second": 78.308, "eval_steps_per_second": 4.925, "step": 4511 }, { "epoch": 401.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3019176125526428, "eval_runtime": 1.8259, "eval_samples_per_second": 87.08, "eval_steps_per_second": 5.477, "step": 4522 }, { "epoch": 402.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.28099265694618225, "eval_runtime": 1.7238, "eval_samples_per_second": 92.239, "eval_steps_per_second": 5.801, "step": 4533 }, { "epoch": 404.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2745566666126251, "eval_runtime": 1.7857, "eval_samples_per_second": 89.039, "eval_steps_per_second": 5.6, "step": 4545 }, { "epoch": 404.44, "grad_norm": 0.8649039268493652, "learning_rate": 1.6102272727272727e-05, "loss": 0.023, "step": 4550 }, { "epoch": 404.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2850847542285919, "eval_runtime": 1.8274, "eval_samples_per_second": 87.011, "eval_steps_per_second": 5.472, "step": 4556 }, { "epoch": 405.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.31582126021385193, "eval_runtime": 1.742, "eval_samples_per_second": 91.274, "eval_steps_per_second": 5.741, "step": 4567 }, { "epoch": 406.93, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.34668126702308655, "eval_runtime": 1.8815, "eval_samples_per_second": 84.506, "eval_steps_per_second": 5.315, "step": 4578 }, { "epoch": 408.0, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.34958958625793457, "eval_runtime": 2.0856, "eval_samples_per_second": 76.236, "eval_steps_per_second": 4.795, "step": 4590 }, { "epoch": 408.89, "grad_norm": 1.8184185028076172, "learning_rate": 1.591287878787879e-05, "loss": 0.0164, "step": 4600 }, { "epoch": 408.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.33241006731987, "eval_runtime": 2.1278, "eval_samples_per_second": 74.727, "eval_steps_per_second": 4.7, "step": 4601 }, { "epoch": 409.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.32462239265441895, "eval_runtime": 2.221, "eval_samples_per_second": 71.589, "eval_steps_per_second": 4.502, "step": 4612 }, { "epoch": 410.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3765309154987335, "eval_runtime": 2.0273, "eval_samples_per_second": 78.43, "eval_steps_per_second": 4.933, "step": 4623 }, { "epoch": 412.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3543161451816559, "eval_runtime": 2.0351, "eval_samples_per_second": 78.129, "eval_steps_per_second": 4.914, "step": 4635 }, { "epoch": 412.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3280029594898224, "eval_runtime": 2.1541, "eval_samples_per_second": 73.813, "eval_steps_per_second": 4.642, "step": 4646 }, { "epoch": 413.33, "grad_norm": 1.7262401580810547, "learning_rate": 1.572348484848485e-05, "loss": 0.0189, "step": 4650 }, { "epoch": 413.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.30754944682121277, "eval_runtime": 1.987, "eval_samples_per_second": 80.018, "eval_steps_per_second": 5.033, "step": 4657 }, { "epoch": 414.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3012823462486267, "eval_runtime": 2.084, "eval_samples_per_second": 76.297, "eval_steps_per_second": 4.799, "step": 4668 }, { "epoch": 416.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3047963082790375, "eval_runtime": 2.1147, "eval_samples_per_second": 75.187, "eval_steps_per_second": 4.729, "step": 4680 }, { "epoch": 416.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.297464519739151, "eval_runtime": 2.0439, "eval_samples_per_second": 77.791, "eval_steps_per_second": 4.893, "step": 4691 }, { "epoch": 417.78, "grad_norm": 0.03005032427608967, "learning_rate": 1.553409090909091e-05, "loss": 0.018, "step": 4700 }, { "epoch": 417.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.30111947655677795, "eval_runtime": 2.0823, "eval_samples_per_second": 76.356, "eval_steps_per_second": 4.802, "step": 4702 }, { "epoch": 418.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3059113621711731, "eval_runtime": 2.0164, "eval_samples_per_second": 78.853, "eval_steps_per_second": 4.959, "step": 4713 }, { "epoch": 420.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3002815544605255, "eval_runtime": 2.0599, "eval_samples_per_second": 77.187, "eval_steps_per_second": 4.855, "step": 4725 }, { "epoch": 420.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.2898853123188019, "eval_runtime": 2.1653, "eval_samples_per_second": 73.43, "eval_steps_per_second": 4.618, "step": 4736 }, { "epoch": 421.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.27394920587539673, "eval_runtime": 1.976, "eval_samples_per_second": 80.464, "eval_steps_per_second": 5.061, "step": 4747 }, { "epoch": 422.22, "grad_norm": 0.05734672769904137, "learning_rate": 1.534469696969697e-05, "loss": 0.014, "step": 4750 }, { "epoch": 422.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.28232210874557495, "eval_runtime": 2.0336, "eval_samples_per_second": 78.186, "eval_steps_per_second": 4.917, "step": 4758 }, { "epoch": 424.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3002234697341919, "eval_runtime": 2.1015, "eval_samples_per_second": 75.661, "eval_steps_per_second": 4.759, "step": 4770 }, { "epoch": 424.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.31039535999298096, "eval_runtime": 2.0591, "eval_samples_per_second": 77.218, "eval_steps_per_second": 4.856, "step": 4781 }, { "epoch": 425.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2992786467075348, "eval_runtime": 2.219, "eval_samples_per_second": 71.652, "eval_steps_per_second": 4.506, "step": 4792 }, { "epoch": 426.67, "grad_norm": 0.20316560566425323, "learning_rate": 1.5155303030303031e-05, "loss": 0.0161, "step": 4800 }, { "epoch": 426.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.28384503722190857, "eval_runtime": 2.3212, "eval_samples_per_second": 68.5, "eval_steps_per_second": 4.308, "step": 4803 }, { "epoch": 428.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.303459495306015, "eval_runtime": 2.0531, "eval_samples_per_second": 77.442, "eval_steps_per_second": 4.871, "step": 4815 }, { "epoch": 428.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.31719303131103516, "eval_runtime": 2.0034, "eval_samples_per_second": 79.365, "eval_steps_per_second": 4.992, "step": 4826 }, { "epoch": 429.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2884739935398102, "eval_runtime": 2.1854, "eval_samples_per_second": 72.756, "eval_steps_per_second": 4.576, "step": 4837 }, { "epoch": 430.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2915368676185608, "eval_runtime": 2.0672, "eval_samples_per_second": 76.914, "eval_steps_per_second": 4.837, "step": 4848 }, { "epoch": 431.11, "grad_norm": 0.1926555037498474, "learning_rate": 1.496590909090909e-05, "loss": 0.0181, "step": 4850 }, { "epoch": 432.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.32380226254463196, "eval_runtime": 2.0107, "eval_samples_per_second": 79.076, "eval_steps_per_second": 4.973, "step": 4860 }, { "epoch": 432.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3051411807537079, "eval_runtime": 2.0979, "eval_samples_per_second": 75.789, "eval_steps_per_second": 4.767, "step": 4871 }, { "epoch": 433.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2746570408344269, "eval_runtime": 2.0978, "eval_samples_per_second": 75.795, "eval_steps_per_second": 4.767, "step": 4882 }, { "epoch": 434.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.27779048681259155, "eval_runtime": 2.1681, "eval_samples_per_second": 73.336, "eval_steps_per_second": 4.612, "step": 4893 }, { "epoch": 435.56, "grad_norm": 0.2639506757259369, "learning_rate": 1.4776515151515152e-05, "loss": 0.0152, "step": 4900 }, { "epoch": 436.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3142688274383545, "eval_runtime": 2.0074, "eval_samples_per_second": 79.208, "eval_steps_per_second": 4.982, "step": 4905 }, { "epoch": 436.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.29534852504730225, "eval_runtime": 2.0119, "eval_samples_per_second": 79.031, "eval_steps_per_second": 4.97, "step": 4916 }, { "epoch": 437.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2987271249294281, "eval_runtime": 2.0466, "eval_samples_per_second": 77.691, "eval_steps_per_second": 4.886, "step": 4927 }, { "epoch": 438.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3240003287792206, "eval_runtime": 2.1303, "eval_samples_per_second": 74.638, "eval_steps_per_second": 4.694, "step": 4938 }, { "epoch": 440.0, "grad_norm": 1.0273933410644531, "learning_rate": 1.4587121212121213e-05, "loss": 0.0233, "step": 4950 }, { "epoch": 440.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2931964099407196, "eval_runtime": 2.0028, "eval_samples_per_second": 79.388, "eval_steps_per_second": 4.993, "step": 4950 }, { "epoch": 440.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.30667683482170105, "eval_runtime": 2.1028, "eval_samples_per_second": 75.614, "eval_steps_per_second": 4.756, "step": 4961 }, { "epoch": 441.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.31695908308029175, "eval_runtime": 2.1429, "eval_samples_per_second": 74.198, "eval_steps_per_second": 4.667, "step": 4972 }, { "epoch": 442.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.33484575152397156, "eval_runtime": 2.2487, "eval_samples_per_second": 70.709, "eval_steps_per_second": 4.447, "step": 4983 }, { "epoch": 444.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3350779116153717, "eval_runtime": 2.2089, "eval_samples_per_second": 71.981, "eval_steps_per_second": 4.527, "step": 4995 }, { "epoch": 444.44, "grad_norm": 0.05571739375591278, "learning_rate": 1.4397727272727274e-05, "loss": 0.0134, "step": 5000 }, { "epoch": 444.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.33779439330101013, "eval_runtime": 2.155, "eval_samples_per_second": 73.781, "eval_steps_per_second": 4.64, "step": 5006 }, { "epoch": 445.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.32037729024887085, "eval_runtime": 2.1415, "eval_samples_per_second": 74.247, "eval_steps_per_second": 4.67, "step": 5017 }, { "epoch": 446.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.30960965156555176, "eval_runtime": 2.0664, "eval_samples_per_second": 76.947, "eval_steps_per_second": 4.839, "step": 5028 }, { "epoch": 448.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3135194480419159, "eval_runtime": 2.1609, "eval_samples_per_second": 73.581, "eval_steps_per_second": 4.628, "step": 5040 }, { "epoch": 448.89, "grad_norm": 1.2499555349349976, "learning_rate": 1.4208333333333333e-05, "loss": 0.0185, "step": 5050 }, { "epoch": 448.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.32047778367996216, "eval_runtime": 2.0116, "eval_samples_per_second": 79.04, "eval_steps_per_second": 4.971, "step": 5051 }, { "epoch": 449.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3151703476905823, "eval_runtime": 1.9982, "eval_samples_per_second": 79.571, "eval_steps_per_second": 5.004, "step": 5062 }, { "epoch": 450.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.32720035314559937, "eval_runtime": 2.0554, "eval_samples_per_second": 77.357, "eval_steps_per_second": 4.865, "step": 5073 }, { "epoch": 452.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.31637299060821533, "eval_runtime": 2.0655, "eval_samples_per_second": 76.978, "eval_steps_per_second": 4.841, "step": 5085 }, { "epoch": 452.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3297300934791565, "eval_runtime": 2.0194, "eval_samples_per_second": 78.737, "eval_steps_per_second": 4.952, "step": 5096 }, { "epoch": 453.33, "grad_norm": 0.4623982012271881, "learning_rate": 1.4018939393939395e-05, "loss": 0.0149, "step": 5100 }, { "epoch": 453.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3299054801464081, "eval_runtime": 2.027, "eval_samples_per_second": 78.441, "eval_steps_per_second": 4.933, "step": 5107 }, { "epoch": 454.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34270188212394714, "eval_runtime": 2.0462, "eval_samples_per_second": 77.705, "eval_steps_per_second": 4.887, "step": 5118 }, { "epoch": 456.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3775523006916046, "eval_runtime": 2.0532, "eval_samples_per_second": 77.442, "eval_steps_per_second": 4.871, "step": 5130 }, { "epoch": 456.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.376447468996048, "eval_runtime": 2.0839, "eval_samples_per_second": 76.298, "eval_steps_per_second": 4.799, "step": 5141 }, { "epoch": 457.78, "grad_norm": 0.23284748196601868, "learning_rate": 1.3829545454545456e-05, "loss": 0.0099, "step": 5150 }, { "epoch": 457.96, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.3852477967739105, "eval_runtime": 2.0765, "eval_samples_per_second": 76.569, "eval_steps_per_second": 4.816, "step": 5152 }, { "epoch": 458.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.35552406311035156, "eval_runtime": 2.0834, "eval_samples_per_second": 76.318, "eval_steps_per_second": 4.8, "step": 5163 }, { "epoch": 460.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3497180640697479, "eval_runtime": 2.1727, "eval_samples_per_second": 73.182, "eval_steps_per_second": 4.603, "step": 5175 }, { "epoch": 460.98, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.3959099054336548, "eval_runtime": 2.2063, "eval_samples_per_second": 72.066, "eval_steps_per_second": 4.532, "step": 5186 }, { "epoch": 461.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3428646922111511, "eval_runtime": 2.0667, "eval_samples_per_second": 76.934, "eval_steps_per_second": 4.839, "step": 5197 }, { "epoch": 462.22, "grad_norm": 0.01973637193441391, "learning_rate": 1.3640151515151516e-05, "loss": 0.0123, "step": 5200 }, { "epoch": 462.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3277600407600403, "eval_runtime": 2.0262, "eval_samples_per_second": 78.472, "eval_steps_per_second": 4.935, "step": 5208 }, { "epoch": 464.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.307450532913208, "eval_runtime": 2.1318, "eval_samples_per_second": 74.586, "eval_steps_per_second": 4.691, "step": 5220 }, { "epoch": 464.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.30191025137901306, "eval_runtime": 2.0236, "eval_samples_per_second": 78.574, "eval_steps_per_second": 4.942, "step": 5231 }, { "epoch": 465.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3069049119949341, "eval_runtime": 1.9794, "eval_samples_per_second": 80.326, "eval_steps_per_second": 5.052, "step": 5242 }, { "epoch": 466.67, "grad_norm": 1.7077068090438843, "learning_rate": 1.3450757575757575e-05, "loss": 0.0169, "step": 5250 }, { "epoch": 466.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3036327362060547, "eval_runtime": 2.2515, "eval_samples_per_second": 70.62, "eval_steps_per_second": 4.442, "step": 5253 }, { "epoch": 468.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.32558977603912354, "eval_runtime": 2.0075, "eval_samples_per_second": 79.202, "eval_steps_per_second": 4.981, "step": 5265 }, { "epoch": 468.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3241185247898102, "eval_runtime": 2.079, "eval_samples_per_second": 76.48, "eval_steps_per_second": 4.81, "step": 5276 }, { "epoch": 469.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.32361313700675964, "eval_runtime": 2.2276, "eval_samples_per_second": 71.378, "eval_steps_per_second": 4.489, "step": 5287 }, { "epoch": 470.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.32213094830513, "eval_runtime": 2.0555, "eval_samples_per_second": 77.353, "eval_steps_per_second": 4.865, "step": 5298 }, { "epoch": 471.11, "grad_norm": 2.2473459243774414, "learning_rate": 1.3261363636363636e-05, "loss": 0.0114, "step": 5300 }, { "epoch": 472.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2958085536956787, "eval_runtime": 2.1042, "eval_samples_per_second": 75.563, "eval_steps_per_second": 4.752, "step": 5310 }, { "epoch": 472.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.2994365692138672, "eval_runtime": 2.0536, "eval_samples_per_second": 77.424, "eval_steps_per_second": 4.869, "step": 5321 }, { "epoch": 473.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.29937687516212463, "eval_runtime": 2.0807, "eval_samples_per_second": 76.417, "eval_steps_per_second": 4.806, "step": 5332 }, { "epoch": 474.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.4239935576915741, "eval_runtime": 2.0885, "eval_samples_per_second": 76.13, "eval_steps_per_second": 4.788, "step": 5343 }, { "epoch": 475.56, "grad_norm": 0.01770736277103424, "learning_rate": 1.3071969696969698e-05, "loss": 0.0148, "step": 5350 }, { "epoch": 476.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.32858237624168396, "eval_runtime": 2.0527, "eval_samples_per_second": 77.46, "eval_steps_per_second": 4.872, "step": 5355 }, { "epoch": 476.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2954269051551819, "eval_runtime": 2.1594, "eval_samples_per_second": 73.63, "eval_steps_per_second": 4.631, "step": 5366 }, { "epoch": 477.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.29593905806541443, "eval_runtime": 2.1654, "eval_samples_per_second": 73.426, "eval_steps_per_second": 4.618, "step": 5377 }, { "epoch": 478.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2928108274936676, "eval_runtime": 2.2433, "eval_samples_per_second": 70.877, "eval_steps_per_second": 4.458, "step": 5388 }, { "epoch": 480.0, "grad_norm": 1.7406607866287231, "learning_rate": 1.2882575757575757e-05, "loss": 0.0171, "step": 5400 }, { "epoch": 480.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.2977100610733032, "eval_runtime": 2.0243, "eval_samples_per_second": 78.544, "eval_steps_per_second": 4.94, "step": 5400 }, { "epoch": 480.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.30747535824775696, "eval_runtime": 2.0298, "eval_samples_per_second": 78.334, "eval_steps_per_second": 4.927, "step": 5411 }, { "epoch": 481.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3572753071784973, "eval_runtime": 2.0524, "eval_samples_per_second": 77.47, "eval_steps_per_second": 4.872, "step": 5422 }, { "epoch": 482.93, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.3878822326660156, "eval_runtime": 2.0986, "eval_samples_per_second": 75.766, "eval_steps_per_second": 4.765, "step": 5433 }, { "epoch": 484.0, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.3886529803276062, "eval_runtime": 2.078, "eval_samples_per_second": 76.517, "eval_steps_per_second": 4.812, "step": 5445 }, { "epoch": 484.44, "grad_norm": 0.06283226609230042, "learning_rate": 1.2693181818181818e-05, "loss": 0.0166, "step": 5450 }, { "epoch": 484.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.3698625862598419, "eval_runtime": 2.094, "eval_samples_per_second": 75.932, "eval_steps_per_second": 4.776, "step": 5456 }, { "epoch": 485.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.351385235786438, "eval_runtime": 2.0668, "eval_samples_per_second": 76.93, "eval_steps_per_second": 4.838, "step": 5467 }, { "epoch": 486.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34395086765289307, "eval_runtime": 2.12, "eval_samples_per_second": 74.999, "eval_steps_per_second": 4.717, "step": 5478 }, { "epoch": 488.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.31205570697784424, "eval_runtime": 2.2336, "eval_samples_per_second": 71.184, "eval_steps_per_second": 4.477, "step": 5490 }, { "epoch": 488.89, "grad_norm": 1.9271873235702515, "learning_rate": 1.2503787878787879e-05, "loss": 0.0169, "step": 5500 }, { "epoch": 488.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3185611069202423, "eval_runtime": 1.9966, "eval_samples_per_second": 79.635, "eval_steps_per_second": 5.008, "step": 5501 }, { "epoch": 489.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3383605182170868, "eval_runtime": 2.0874, "eval_samples_per_second": 76.17, "eval_steps_per_second": 4.791, "step": 5512 }, { "epoch": 490.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.35870400071144104, "eval_runtime": 2.2491, "eval_samples_per_second": 70.694, "eval_steps_per_second": 4.446, "step": 5523 }, { "epoch": 492.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3265625238418579, "eval_runtime": 2.0134, "eval_samples_per_second": 78.971, "eval_steps_per_second": 4.967, "step": 5535 }, { "epoch": 492.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3273981213569641, "eval_runtime": 2.0972, "eval_samples_per_second": 75.815, "eval_steps_per_second": 4.768, "step": 5546 }, { "epoch": 493.33, "grad_norm": 0.3140685260295868, "learning_rate": 1.2314393939393941e-05, "loss": 0.0162, "step": 5550 }, { "epoch": 493.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3433980345726013, "eval_runtime": 4.4757, "eval_samples_per_second": 35.525, "eval_steps_per_second": 2.234, "step": 5557 }, { "epoch": 494.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3295518755912781, "eval_runtime": 2.0317, "eval_samples_per_second": 78.259, "eval_steps_per_second": 4.922, "step": 5568 }, { "epoch": 496.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.31786414980888367, "eval_runtime": 2.1435, "eval_samples_per_second": 74.179, "eval_steps_per_second": 4.665, "step": 5580 }, { "epoch": 496.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.32228994369506836, "eval_runtime": 2.0036, "eval_samples_per_second": 79.357, "eval_steps_per_second": 4.991, "step": 5591 }, { "epoch": 497.78, "grad_norm": 1.7739616632461548, "learning_rate": 1.2125e-05, "loss": 0.0128, "step": 5600 }, { "epoch": 497.96, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.3525673747062683, "eval_runtime": 2.0848, "eval_samples_per_second": 76.266, "eval_steps_per_second": 4.797, "step": 5602 }, { "epoch": 498.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3345227539539337, "eval_runtime": 2.0597, "eval_samples_per_second": 77.194, "eval_steps_per_second": 4.855, "step": 5613 }, { "epoch": 500.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3081194758415222, "eval_runtime": 2.14, "eval_samples_per_second": 74.297, "eval_steps_per_second": 4.673, "step": 5625 }, { "epoch": 500.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3136290907859802, "eval_runtime": 2.0866, "eval_samples_per_second": 76.201, "eval_steps_per_second": 4.793, "step": 5636 }, { "epoch": 501.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.31603533029556274, "eval_runtime": 2.272, "eval_samples_per_second": 69.983, "eval_steps_per_second": 4.401, "step": 5647 }, { "epoch": 502.22, "grad_norm": 0.024508927017450333, "learning_rate": 1.193560606060606e-05, "loss": 0.0089, "step": 5650 }, { "epoch": 502.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3217502236366272, "eval_runtime": 2.2568, "eval_samples_per_second": 70.454, "eval_steps_per_second": 4.431, "step": 5658 }, { "epoch": 504.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3330020606517792, "eval_runtime": 2.1528, "eval_samples_per_second": 73.857, "eval_steps_per_second": 4.645, "step": 5670 }, { "epoch": 504.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3610976040363312, "eval_runtime": 2.1981, "eval_samples_per_second": 72.335, "eval_steps_per_second": 4.549, "step": 5681 }, { "epoch": 505.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3819771111011505, "eval_runtime": 2.0605, "eval_samples_per_second": 77.167, "eval_steps_per_second": 4.853, "step": 5692 }, { "epoch": 506.67, "grad_norm": 0.13250546157360077, "learning_rate": 1.1746212121212121e-05, "loss": 0.0168, "step": 5700 }, { "epoch": 506.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3471725881099701, "eval_runtime": 2.0816, "eval_samples_per_second": 76.384, "eval_steps_per_second": 4.804, "step": 5703 }, { "epoch": 508.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3075188100337982, "eval_runtime": 2.1057, "eval_samples_per_second": 75.51, "eval_steps_per_second": 4.749, "step": 5715 }, { "epoch": 508.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.30466988682746887, "eval_runtime": 2.1027, "eval_samples_per_second": 75.617, "eval_steps_per_second": 4.756, "step": 5726 }, { "epoch": 509.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.314418226480484, "eval_runtime": 2.1578, "eval_samples_per_second": 73.686, "eval_steps_per_second": 4.634, "step": 5737 }, { "epoch": 510.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3144315183162689, "eval_runtime": 2.2895, "eval_samples_per_second": 69.447, "eval_steps_per_second": 4.368, "step": 5748 }, { "epoch": 511.11, "grad_norm": 1.371584415435791, "learning_rate": 1.1556818181818184e-05, "loss": 0.0143, "step": 5750 }, { "epoch": 512.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.30977222323417664, "eval_runtime": 2.0905, "eval_samples_per_second": 76.059, "eval_steps_per_second": 4.784, "step": 5760 }, { "epoch": 512.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.31324854493141174, "eval_runtime": 2.2018, "eval_samples_per_second": 72.212, "eval_steps_per_second": 4.542, "step": 5771 }, { "epoch": 513.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3324536979198456, "eval_runtime": 2.096, "eval_samples_per_second": 75.859, "eval_steps_per_second": 4.771, "step": 5782 }, { "epoch": 514.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.32093632221221924, "eval_runtime": 2.049, "eval_samples_per_second": 77.599, "eval_steps_per_second": 4.88, "step": 5793 }, { "epoch": 515.56, "grad_norm": 1.4226562976837158, "learning_rate": 1.1367424242424243e-05, "loss": 0.014, "step": 5800 }, { "epoch": 516.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3191947937011719, "eval_runtime": 2.0898, "eval_samples_per_second": 76.083, "eval_steps_per_second": 4.785, "step": 5805 }, { "epoch": 516.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.311814546585083, "eval_runtime": 2.0315, "eval_samples_per_second": 78.269, "eval_steps_per_second": 4.923, "step": 5816 }, { "epoch": 517.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.31416967511177063, "eval_runtime": 2.0132, "eval_samples_per_second": 78.978, "eval_steps_per_second": 4.967, "step": 5827 }, { "epoch": 518.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3255424201488495, "eval_runtime": 2.4361, "eval_samples_per_second": 65.269, "eval_steps_per_second": 4.105, "step": 5838 }, { "epoch": 520.0, "grad_norm": 0.1621515154838562, "learning_rate": 1.1178030303030303e-05, "loss": 0.0111, "step": 5850 }, { "epoch": 520.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.32208895683288574, "eval_runtime": 2.0821, "eval_samples_per_second": 76.364, "eval_steps_per_second": 4.803, "step": 5850 }, { "epoch": 520.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3211723566055298, "eval_runtime": 2.0312, "eval_samples_per_second": 78.28, "eval_steps_per_second": 4.923, "step": 5861 }, { "epoch": 521.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.32905757427215576, "eval_runtime": 2.0294, "eval_samples_per_second": 78.349, "eval_steps_per_second": 4.928, "step": 5872 }, { "epoch": 522.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.33144110441207886, "eval_runtime": 2.032, "eval_samples_per_second": 78.249, "eval_steps_per_second": 4.921, "step": 5883 }, { "epoch": 524.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3268250823020935, "eval_runtime": 2.0687, "eval_samples_per_second": 76.859, "eval_steps_per_second": 4.834, "step": 5895 }, { "epoch": 524.44, "grad_norm": 0.008243849501013756, "learning_rate": 1.0988636363636364e-05, "loss": 0.0107, "step": 5900 }, { "epoch": 524.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3351696729660034, "eval_runtime": 2.155, "eval_samples_per_second": 73.782, "eval_steps_per_second": 4.64, "step": 5906 }, { "epoch": 525.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34242841601371765, "eval_runtime": 2.0063, "eval_samples_per_second": 79.249, "eval_steps_per_second": 4.984, "step": 5917 }, { "epoch": 526.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.33888906240463257, "eval_runtime": 2.2365, "eval_samples_per_second": 71.093, "eval_steps_per_second": 4.471, "step": 5928 }, { "epoch": 528.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3547358810901642, "eval_runtime": 2.0755, "eval_samples_per_second": 76.609, "eval_steps_per_second": 4.818, "step": 5940 }, { "epoch": 528.89, "grad_norm": 0.47511938214302063, "learning_rate": 1.0799242424242423e-05, "loss": 0.01, "step": 5950 }, { "epoch": 528.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.34747716784477234, "eval_runtime": 2.0823, "eval_samples_per_second": 76.358, "eval_steps_per_second": 4.802, "step": 5951 }, { "epoch": 529.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35945838689804077, "eval_runtime": 2.0524, "eval_samples_per_second": 77.469, "eval_steps_per_second": 4.872, "step": 5962 }, { "epoch": 530.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3673442602157593, "eval_runtime": 2.0276, "eval_samples_per_second": 78.419, "eval_steps_per_second": 4.932, "step": 5973 }, { "epoch": 532.0, "eval_accuracy": 0.9119496855345912, "eval_loss": 0.41652363538742065, "eval_runtime": 2.0573, "eval_samples_per_second": 77.285, "eval_steps_per_second": 4.861, "step": 5985 }, { "epoch": 532.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.42472416162490845, "eval_runtime": 2.1003, "eval_samples_per_second": 75.704, "eval_steps_per_second": 4.761, "step": 5996 }, { "epoch": 533.33, "grad_norm": 0.15851238369941711, "learning_rate": 1.0609848484848485e-05, "loss": 0.0126, "step": 6000 }, { "epoch": 533.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.4061521589756012, "eval_runtime": 2.0889, "eval_samples_per_second": 76.116, "eval_steps_per_second": 4.787, "step": 6007 }, { "epoch": 534.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3752112090587616, "eval_runtime": 2.0476, "eval_samples_per_second": 77.651, "eval_steps_per_second": 4.884, "step": 6018 }, { "epoch": 536.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.35743284225463867, "eval_runtime": 2.2159, "eval_samples_per_second": 71.753, "eval_steps_per_second": 4.513, "step": 6030 }, { "epoch": 536.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3824201226234436, "eval_runtime": 2.0455, "eval_samples_per_second": 77.732, "eval_steps_per_second": 4.889, "step": 6041 }, { "epoch": 537.78, "grad_norm": 0.0922364741563797, "learning_rate": 1.0420454545454546e-05, "loss": 0.0126, "step": 6050 }, { "epoch": 537.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3730430006980896, "eval_runtime": 2.1192, "eval_samples_per_second": 75.028, "eval_steps_per_second": 4.719, "step": 6052 }, { "epoch": 538.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3703514337539673, "eval_runtime": 2.2056, "eval_samples_per_second": 72.091, "eval_steps_per_second": 4.534, "step": 6063 }, { "epoch": 540.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.38142630457878113, "eval_runtime": 2.0818, "eval_samples_per_second": 76.376, "eval_steps_per_second": 4.804, "step": 6075 }, { "epoch": 540.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3648853302001953, "eval_runtime": 2.2199, "eval_samples_per_second": 71.625, "eval_steps_per_second": 4.505, "step": 6086 }, { "epoch": 541.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3810517489910126, "eval_runtime": 2.0826, "eval_samples_per_second": 76.345, "eval_steps_per_second": 4.802, "step": 6097 }, { "epoch": 542.22, "grad_norm": 0.04241061210632324, "learning_rate": 1.0231060606060607e-05, "loss": 0.012, "step": 6100 }, { "epoch": 542.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3544082045555115, "eval_runtime": 2.08, "eval_samples_per_second": 76.442, "eval_steps_per_second": 4.808, "step": 6108 }, { "epoch": 544.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3614555597305298, "eval_runtime": 2.2123, "eval_samples_per_second": 71.871, "eval_steps_per_second": 4.52, "step": 6120 }, { "epoch": 544.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35575661063194275, "eval_runtime": 2.1324, "eval_samples_per_second": 74.564, "eval_steps_per_second": 4.69, "step": 6131 }, { "epoch": 545.96, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.34816914796829224, "eval_runtime": 2.0819, "eval_samples_per_second": 76.371, "eval_steps_per_second": 4.803, "step": 6142 }, { "epoch": 546.67, "grad_norm": 0.5738076567649841, "learning_rate": 1.0041666666666666e-05, "loss": 0.0135, "step": 6150 }, { "epoch": 546.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.36677080392837524, "eval_runtime": 2.1421, "eval_samples_per_second": 74.226, "eval_steps_per_second": 4.668, "step": 6153 }, { "epoch": 548.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34037116169929504, "eval_runtime": 2.0657, "eval_samples_per_second": 76.972, "eval_steps_per_second": 4.841, "step": 6165 }, { "epoch": 548.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33401021361351013, "eval_runtime": 2.0325, "eval_samples_per_second": 78.229, "eval_steps_per_second": 4.92, "step": 6176 }, { "epoch": 549.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3377488851547241, "eval_runtime": 2.1646, "eval_samples_per_second": 73.456, "eval_steps_per_second": 4.62, "step": 6187 }, { "epoch": 550.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3406839966773987, "eval_runtime": 2.1382, "eval_samples_per_second": 74.36, "eval_steps_per_second": 4.677, "step": 6198 }, { "epoch": 551.11, "grad_norm": 0.34322044253349304, "learning_rate": 9.852272727272728e-06, "loss": 0.0101, "step": 6200 }, { "epoch": 552.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.33890071511268616, "eval_runtime": 2.0917, "eval_samples_per_second": 76.015, "eval_steps_per_second": 4.781, "step": 6210 }, { "epoch": 552.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33053550124168396, "eval_runtime": 2.2779, "eval_samples_per_second": 69.8, "eval_steps_per_second": 4.39, "step": 6221 }, { "epoch": 553.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.31986501812934875, "eval_runtime": 1.9669, "eval_samples_per_second": 80.836, "eval_steps_per_second": 5.084, "step": 6232 }, { "epoch": 554.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33377256989479065, "eval_runtime": 2.0395, "eval_samples_per_second": 77.96, "eval_steps_per_second": 4.903, "step": 6243 }, { "epoch": 555.56, "grad_norm": 0.1416609138250351, "learning_rate": 9.662878787878789e-06, "loss": 0.0175, "step": 6250 }, { "epoch": 556.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33228349685668945, "eval_runtime": 2.1542, "eval_samples_per_second": 73.811, "eval_steps_per_second": 4.642, "step": 6255 }, { "epoch": 556.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.340250164270401, "eval_runtime": 2.0563, "eval_samples_per_second": 77.325, "eval_steps_per_second": 4.863, "step": 6266 }, { "epoch": 557.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34735485911369324, "eval_runtime": 2.0285, "eval_samples_per_second": 78.384, "eval_steps_per_second": 4.93, "step": 6277 }, { "epoch": 558.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34990042448043823, "eval_runtime": 2.1967, "eval_samples_per_second": 72.38, "eval_steps_per_second": 4.552, "step": 6288 }, { "epoch": 560.0, "grad_norm": 0.09764547646045685, "learning_rate": 9.473484848484848e-06, "loss": 0.0108, "step": 6300 }, { "epoch": 560.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.342894971370697, "eval_runtime": 2.026, "eval_samples_per_second": 78.479, "eval_steps_per_second": 4.936, "step": 6300 }, { "epoch": 560.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3396158218383789, "eval_runtime": 2.3052, "eval_samples_per_second": 68.976, "eval_steps_per_second": 4.338, "step": 6311 }, { "epoch": 561.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3467164933681488, "eval_runtime": 2.0425, "eval_samples_per_second": 77.846, "eval_steps_per_second": 4.896, "step": 6322 }, { "epoch": 562.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3349219858646393, "eval_runtime": 2.0651, "eval_samples_per_second": 76.992, "eval_steps_per_second": 4.842, "step": 6333 }, { "epoch": 564.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3380991518497467, "eval_runtime": 2.111, "eval_samples_per_second": 75.32, "eval_steps_per_second": 4.737, "step": 6345 }, { "epoch": 564.44, "grad_norm": 0.021107789129018784, "learning_rate": 9.284090909090908e-06, "loss": 0.0139, "step": 6350 }, { "epoch": 564.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.32741737365722656, "eval_runtime": 2.1143, "eval_samples_per_second": 75.203, "eval_steps_per_second": 4.73, "step": 6356 }, { "epoch": 565.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3318650722503662, "eval_runtime": 1.9953, "eval_samples_per_second": 79.688, "eval_steps_per_second": 5.012, "step": 6367 }, { "epoch": 566.93, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.33214500546455383, "eval_runtime": 2.0923, "eval_samples_per_second": 75.992, "eval_steps_per_second": 4.779, "step": 6378 }, { "epoch": 568.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3546938896179199, "eval_runtime": 2.1191, "eval_samples_per_second": 75.033, "eval_steps_per_second": 4.719, "step": 6390 }, { "epoch": 568.89, "grad_norm": 1.3278522491455078, "learning_rate": 9.09469696969697e-06, "loss": 0.0138, "step": 6400 }, { "epoch": 568.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.366202175617218, "eval_runtime": 2.0849, "eval_samples_per_second": 76.261, "eval_steps_per_second": 4.796, "step": 6401 }, { "epoch": 569.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.34554189443588257, "eval_runtime": 2.2433, "eval_samples_per_second": 70.878, "eval_steps_per_second": 4.458, "step": 6412 }, { "epoch": 570.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3477872908115387, "eval_runtime": 2.0921, "eval_samples_per_second": 76.0, "eval_steps_per_second": 4.78, "step": 6423 }, { "epoch": 572.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3400007486343384, "eval_runtime": 2.0746, "eval_samples_per_second": 76.641, "eval_steps_per_second": 4.82, "step": 6435 }, { "epoch": 572.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3512841463088989, "eval_runtime": 2.0975, "eval_samples_per_second": 75.803, "eval_steps_per_second": 4.767, "step": 6446 }, { "epoch": 573.33, "grad_norm": 0.1855485886335373, "learning_rate": 8.905303030303031e-06, "loss": 0.0095, "step": 6450 }, { "epoch": 573.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3461546301841736, "eval_runtime": 2.067, "eval_samples_per_second": 76.921, "eval_steps_per_second": 4.838, "step": 6457 }, { "epoch": 574.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.33488187193870544, "eval_runtime": 2.0691, "eval_samples_per_second": 76.846, "eval_steps_per_second": 4.833, "step": 6468 }, { "epoch": 576.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.337620347738266, "eval_runtime": 2.018, "eval_samples_per_second": 78.793, "eval_steps_per_second": 4.956, "step": 6480 }, { "epoch": 576.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.33732709288597107, "eval_runtime": 2.0922, "eval_samples_per_second": 75.996, "eval_steps_per_second": 4.78, "step": 6491 }, { "epoch": 577.78, "grad_norm": 0.9204933643341064, "learning_rate": 8.71590909090909e-06, "loss": 0.0138, "step": 6500 }, { "epoch": 577.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3310604989528656, "eval_runtime": 2.1334, "eval_samples_per_second": 74.528, "eval_steps_per_second": 4.687, "step": 6502 }, { "epoch": 578.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.33120694756507874, "eval_runtime": 2.1395, "eval_samples_per_second": 74.316, "eval_steps_per_second": 4.674, "step": 6513 }, { "epoch": 580.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3291258215904236, "eval_runtime": 2.1193, "eval_samples_per_second": 75.024, "eval_steps_per_second": 4.719, "step": 6525 }, { "epoch": 580.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3441867232322693, "eval_runtime": 2.081, "eval_samples_per_second": 76.405, "eval_steps_per_second": 4.805, "step": 6536 }, { "epoch": 581.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3806348145008087, "eval_runtime": 2.08, "eval_samples_per_second": 76.443, "eval_steps_per_second": 4.808, "step": 6547 }, { "epoch": 582.22, "grad_norm": 1.3162257671356201, "learning_rate": 8.526515151515151e-06, "loss": 0.0163, "step": 6550 }, { "epoch": 582.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.39340561628341675, "eval_runtime": 2.0419, "eval_samples_per_second": 77.868, "eval_steps_per_second": 4.897, "step": 6558 }, { "epoch": 584.0, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3990216851234436, "eval_runtime": 2.049, "eval_samples_per_second": 77.599, "eval_steps_per_second": 4.88, "step": 6570 }, { "epoch": 584.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.353302925825119, "eval_runtime": 2.1595, "eval_samples_per_second": 73.629, "eval_steps_per_second": 4.631, "step": 6581 }, { "epoch": 585.96, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.34103333950042725, "eval_runtime": 2.2099, "eval_samples_per_second": 71.948, "eval_steps_per_second": 4.525, "step": 6592 }, { "epoch": 586.67, "grad_norm": 0.35993504524230957, "learning_rate": 8.337121212121213e-06, "loss": 0.0152, "step": 6600 }, { "epoch": 586.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3351433575153351, "eval_runtime": 2.2699, "eval_samples_per_second": 70.046, "eval_steps_per_second": 4.405, "step": 6603 }, { "epoch": 588.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3369242250919342, "eval_runtime": 2.117, "eval_samples_per_second": 75.106, "eval_steps_per_second": 4.724, "step": 6615 }, { "epoch": 588.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35417425632476807, "eval_runtime": 2.144, "eval_samples_per_second": 74.161, "eval_steps_per_second": 4.664, "step": 6626 }, { "epoch": 589.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3728938102722168, "eval_runtime": 2.0531, "eval_samples_per_second": 77.443, "eval_steps_per_second": 4.871, "step": 6637 }, { "epoch": 590.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34069618582725525, "eval_runtime": 2.1327, "eval_samples_per_second": 74.555, "eval_steps_per_second": 4.689, "step": 6648 }, { "epoch": 591.11, "grad_norm": 0.19336657226085663, "learning_rate": 8.147727272727274e-06, "loss": 0.017, "step": 6650 }, { "epoch": 592.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3440462052822113, "eval_runtime": 2.0686, "eval_samples_per_second": 76.865, "eval_steps_per_second": 4.834, "step": 6660 }, { "epoch": 592.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3493140935897827, "eval_runtime": 2.0648, "eval_samples_per_second": 77.004, "eval_steps_per_second": 4.843, "step": 6671 }, { "epoch": 593.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.37120524048805237, "eval_runtime": 2.2033, "eval_samples_per_second": 72.165, "eval_steps_per_second": 4.539, "step": 6682 }, { "epoch": 594.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.36460721492767334, "eval_runtime": 2.2563, "eval_samples_per_second": 70.47, "eval_steps_per_second": 4.432, "step": 6693 }, { "epoch": 595.56, "grad_norm": 0.017406007274985313, "learning_rate": 7.958333333333333e-06, "loss": 0.0113, "step": 6700 }, { "epoch": 596.0, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.36630791425704956, "eval_runtime": 2.0788, "eval_samples_per_second": 76.486, "eval_steps_per_second": 4.81, "step": 6705 }, { "epoch": 596.98, "eval_accuracy": 0.9245283018867925, "eval_loss": 0.3725621700286865, "eval_runtime": 2.226, "eval_samples_per_second": 71.429, "eval_steps_per_second": 4.492, "step": 6716 }, { "epoch": 597.96, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.35295018553733826, "eval_runtime": 2.16, "eval_samples_per_second": 73.611, "eval_steps_per_second": 4.63, "step": 6727 }, { "epoch": 598.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3451589047908783, "eval_runtime": 2.0598, "eval_samples_per_second": 77.193, "eval_steps_per_second": 4.855, "step": 6738 }, { "epoch": 600.0, "grad_norm": 0.1029694527387619, "learning_rate": 7.768939393939394e-06, "loss": 0.0115, "step": 6750 }, { "epoch": 600.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3340095281600952, "eval_runtime": 2.1945, "eval_samples_per_second": 72.455, "eval_steps_per_second": 4.557, "step": 6750 }, { "epoch": 600.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34892547130584717, "eval_runtime": 2.1247, "eval_samples_per_second": 74.836, "eval_steps_per_second": 4.707, "step": 6761 }, { "epoch": 601.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3408372402191162, "eval_runtime": 2.1827, "eval_samples_per_second": 72.846, "eval_steps_per_second": 4.582, "step": 6772 }, { "epoch": 602.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3423627018928528, "eval_runtime": 2.2182, "eval_samples_per_second": 71.68, "eval_steps_per_second": 4.508, "step": 6783 }, { "epoch": 604.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34804755449295044, "eval_runtime": 2.1754, "eval_samples_per_second": 73.091, "eval_steps_per_second": 4.597, "step": 6795 }, { "epoch": 604.44, "grad_norm": 0.7808576822280884, "learning_rate": 7.579545454545454e-06, "loss": 0.0132, "step": 6800 }, { "epoch": 604.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34386932849884033, "eval_runtime": 2.0311, "eval_samples_per_second": 78.283, "eval_steps_per_second": 4.923, "step": 6806 }, { "epoch": 605.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3530921936035156, "eval_runtime": 2.102, "eval_samples_per_second": 75.641, "eval_steps_per_second": 4.757, "step": 6817 }, { "epoch": 606.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3807942271232605, "eval_runtime": 2.1101, "eval_samples_per_second": 75.351, "eval_steps_per_second": 4.739, "step": 6828 }, { "epoch": 608.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3441016674041748, "eval_runtime": 2.1163, "eval_samples_per_second": 75.133, "eval_steps_per_second": 4.725, "step": 6840 }, { "epoch": 608.89, "grad_norm": 0.31322968006134033, "learning_rate": 7.390151515151515e-06, "loss": 0.014, "step": 6850 }, { "epoch": 608.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3534349203109741, "eval_runtime": 2.0731, "eval_samples_per_second": 76.696, "eval_steps_per_second": 4.824, "step": 6851 }, { "epoch": 609.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3583095371723175, "eval_runtime": 2.1365, "eval_samples_per_second": 74.419, "eval_steps_per_second": 4.68, "step": 6862 }, { "epoch": 610.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3640231490135193, "eval_runtime": 2.3226, "eval_samples_per_second": 68.457, "eval_steps_per_second": 4.305, "step": 6873 }, { "epoch": 612.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3587685227394104, "eval_runtime": 2.0532, "eval_samples_per_second": 77.44, "eval_steps_per_second": 4.87, "step": 6885 }, { "epoch": 612.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3662501275539398, "eval_runtime": 2.1672, "eval_samples_per_second": 73.368, "eval_steps_per_second": 4.614, "step": 6896 }, { "epoch": 613.33, "grad_norm": 1.508801817893982, "learning_rate": 7.200757575757576e-06, "loss": 0.0089, "step": 6900 }, { "epoch": 613.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3788923919200897, "eval_runtime": 2.0361, "eval_samples_per_second": 78.092, "eval_steps_per_second": 4.911, "step": 6907 }, { "epoch": 614.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.378842294216156, "eval_runtime": 2.0538, "eval_samples_per_second": 77.417, "eval_steps_per_second": 4.869, "step": 6918 }, { "epoch": 616.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3528358042240143, "eval_runtime": 2.0973, "eval_samples_per_second": 75.811, "eval_steps_per_second": 4.768, "step": 6930 }, { "epoch": 616.98, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.3626009523868561, "eval_runtime": 2.1285, "eval_samples_per_second": 74.701, "eval_steps_per_second": 4.698, "step": 6941 }, { "epoch": 617.78, "grad_norm": 0.027906352654099464, "learning_rate": 7.0113636363636365e-06, "loss": 0.0135, "step": 6950 }, { "epoch": 617.96, "eval_accuracy": 0.9182389937106918, "eval_loss": 0.3760795593261719, "eval_runtime": 2.0573, "eval_samples_per_second": 77.285, "eval_steps_per_second": 4.861, "step": 6952 }, { "epoch": 618.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3911431133747101, "eval_runtime": 2.3187, "eval_samples_per_second": 68.573, "eval_steps_per_second": 4.313, "step": 6963 }, { "epoch": 620.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3900914192199707, "eval_runtime": 2.1186, "eval_samples_per_second": 75.049, "eval_steps_per_second": 4.72, "step": 6975 }, { "epoch": 620.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.4003194272518158, "eval_runtime": 2.1007, "eval_samples_per_second": 75.689, "eval_steps_per_second": 4.76, "step": 6986 }, { "epoch": 621.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.36526620388031006, "eval_runtime": 2.1753, "eval_samples_per_second": 73.093, "eval_steps_per_second": 4.597, "step": 6997 }, { "epoch": 622.22, "grad_norm": 0.05157339572906494, "learning_rate": 6.821969696969697e-06, "loss": 0.0071, "step": 7000 }, { "epoch": 622.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.33499374985694885, "eval_runtime": 2.11, "eval_samples_per_second": 75.356, "eval_steps_per_second": 4.739, "step": 7008 }, { "epoch": 624.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3353654444217682, "eval_runtime": 2.0902, "eval_samples_per_second": 76.069, "eval_steps_per_second": 4.784, "step": 7020 }, { "epoch": 624.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.37156394124031067, "eval_runtime": 2.0375, "eval_samples_per_second": 78.038, "eval_steps_per_second": 4.908, "step": 7031 }, { "epoch": 625.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3520486354827881, "eval_runtime": 2.0907, "eval_samples_per_second": 76.051, "eval_steps_per_second": 4.783, "step": 7042 }, { "epoch": 626.67, "grad_norm": 0.5914948582649231, "learning_rate": 6.632575757575758e-06, "loss": 0.0129, "step": 7050 }, { "epoch": 626.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3306971490383148, "eval_runtime": 2.0739, "eval_samples_per_second": 76.667, "eval_steps_per_second": 4.822, "step": 7053 }, { "epoch": 628.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33053889870643616, "eval_runtime": 2.2479, "eval_samples_per_second": 70.731, "eval_steps_per_second": 4.449, "step": 7065 }, { "epoch": 628.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3301643431186676, "eval_runtime": 1.9998, "eval_samples_per_second": 79.509, "eval_steps_per_second": 5.001, "step": 7076 }, { "epoch": 629.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3290785253047943, "eval_runtime": 2.0356, "eval_samples_per_second": 78.11, "eval_steps_per_second": 4.913, "step": 7087 }, { "epoch": 630.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3329908847808838, "eval_runtime": 2.0116, "eval_samples_per_second": 79.04, "eval_steps_per_second": 4.971, "step": 7098 }, { "epoch": 631.11, "grad_norm": 1.9344037771224976, "learning_rate": 6.4431818181818185e-06, "loss": 0.0091, "step": 7100 }, { "epoch": 632.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3331502079963684, "eval_runtime": 2.1322, "eval_samples_per_second": 74.572, "eval_steps_per_second": 4.69, "step": 7110 }, { "epoch": 632.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33215317130088806, "eval_runtime": 2.0089, "eval_samples_per_second": 79.146, "eval_steps_per_second": 4.978, "step": 7121 }, { "epoch": 633.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3437711000442505, "eval_runtime": 2.1614, "eval_samples_per_second": 73.562, "eval_steps_per_second": 4.627, "step": 7132 }, { "epoch": 634.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.36110153794288635, "eval_runtime": 2.1038, "eval_samples_per_second": 75.577, "eval_steps_per_second": 4.753, "step": 7143 }, { "epoch": 635.56, "grad_norm": 0.008998346514999866, "learning_rate": 6.253787878787879e-06, "loss": 0.0107, "step": 7150 }, { "epoch": 636.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34894272685050964, "eval_runtime": 2.1178, "eval_samples_per_second": 75.077, "eval_steps_per_second": 4.722, "step": 7155 }, { "epoch": 636.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3357524573802948, "eval_runtime": 2.1256, "eval_samples_per_second": 74.803, "eval_steps_per_second": 4.705, "step": 7166 }, { "epoch": 637.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3372538983821869, "eval_runtime": 2.0938, "eval_samples_per_second": 75.939, "eval_steps_per_second": 4.776, "step": 7177 }, { "epoch": 638.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3444075584411621, "eval_runtime": 2.1377, "eval_samples_per_second": 74.379, "eval_steps_per_second": 4.678, "step": 7188 }, { "epoch": 640.0, "grad_norm": 0.753413736820221, "learning_rate": 6.06439393939394e-06, "loss": 0.0125, "step": 7200 }, { "epoch": 640.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.36328038573265076, "eval_runtime": 2.0555, "eval_samples_per_second": 77.354, "eval_steps_per_second": 4.865, "step": 7200 }, { "epoch": 640.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3562980592250824, "eval_runtime": 2.0343, "eval_samples_per_second": 78.159, "eval_steps_per_second": 4.916, "step": 7211 }, { "epoch": 641.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.35727426409721375, "eval_runtime": 2.0513, "eval_samples_per_second": 77.513, "eval_steps_per_second": 4.875, "step": 7222 }, { "epoch": 642.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3534907400608063, "eval_runtime": 2.109, "eval_samples_per_second": 75.393, "eval_steps_per_second": 4.742, "step": 7233 }, { "epoch": 644.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34685295820236206, "eval_runtime": 2.1171, "eval_samples_per_second": 75.104, "eval_steps_per_second": 4.724, "step": 7245 }, { "epoch": 644.44, "grad_norm": 0.040267378091812134, "learning_rate": 5.8750000000000005e-06, "loss": 0.0071, "step": 7250 }, { "epoch": 644.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34481677412986755, "eval_runtime": 2.2433, "eval_samples_per_second": 70.878, "eval_steps_per_second": 4.458, "step": 7256 }, { "epoch": 645.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3445126414299011, "eval_runtime": 2.09, "eval_samples_per_second": 76.075, "eval_steps_per_second": 4.785, "step": 7267 }, { "epoch": 646.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3418070077896118, "eval_runtime": 2.1179, "eval_samples_per_second": 75.074, "eval_steps_per_second": 4.722, "step": 7278 }, { "epoch": 648.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3541422188282013, "eval_runtime": 2.0491, "eval_samples_per_second": 77.596, "eval_steps_per_second": 4.88, "step": 7290 }, { "epoch": 648.89, "grad_norm": 0.02006547898054123, "learning_rate": 5.685606060606061e-06, "loss": 0.0076, "step": 7300 }, { "epoch": 648.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34063196182250977, "eval_runtime": 2.1334, "eval_samples_per_second": 74.528, "eval_steps_per_second": 4.687, "step": 7301 }, { "epoch": 649.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3326892852783203, "eval_runtime": 2.0215, "eval_samples_per_second": 78.656, "eval_steps_per_second": 4.947, "step": 7312 }, { "epoch": 650.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3381519019603729, "eval_runtime": 2.1234, "eval_samples_per_second": 74.878, "eval_steps_per_second": 4.709, "step": 7323 }, { "epoch": 652.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3574288785457611, "eval_runtime": 2.2212, "eval_samples_per_second": 71.583, "eval_steps_per_second": 4.502, "step": 7335 }, { "epoch": 652.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3462476134300232, "eval_runtime": 2.3846, "eval_samples_per_second": 66.678, "eval_steps_per_second": 4.194, "step": 7346 }, { "epoch": 653.33, "grad_norm": 0.1632642298936844, "learning_rate": 5.5e-06, "loss": 0.0131, "step": 7350 }, { "epoch": 653.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33882516622543335, "eval_runtime": 2.0171, "eval_samples_per_second": 78.826, "eval_steps_per_second": 4.958, "step": 7357 }, { "epoch": 654.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.337929904460907, "eval_runtime": 2.1283, "eval_samples_per_second": 74.708, "eval_steps_per_second": 4.699, "step": 7368 }, { "epoch": 656.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3396049737930298, "eval_runtime": 2.0868, "eval_samples_per_second": 76.193, "eval_steps_per_second": 4.792, "step": 7380 }, { "epoch": 656.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3436720371246338, "eval_runtime": 2.0283, "eval_samples_per_second": 78.391, "eval_steps_per_second": 4.93, "step": 7391 }, { "epoch": 657.78, "grad_norm": 1.5342937707901, "learning_rate": 5.3106060606060605e-06, "loss": 0.0086, "step": 7400 }, { "epoch": 657.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3466395139694214, "eval_runtime": 2.1077, "eval_samples_per_second": 75.438, "eval_steps_per_second": 4.745, "step": 7402 }, { "epoch": 658.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3453463315963745, "eval_runtime": 2.0776, "eval_samples_per_second": 76.532, "eval_steps_per_second": 4.813, "step": 7413 }, { "epoch": 660.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3420422077178955, "eval_runtime": 2.0546, "eval_samples_per_second": 77.386, "eval_steps_per_second": 4.867, "step": 7425 }, { "epoch": 660.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33706873655319214, "eval_runtime": 2.1267, "eval_samples_per_second": 74.764, "eval_steps_per_second": 4.702, "step": 7436 }, { "epoch": 661.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34426021575927734, "eval_runtime": 2.0904, "eval_samples_per_second": 76.061, "eval_steps_per_second": 4.784, "step": 7447 }, { "epoch": 662.22, "grad_norm": 0.16996954381465912, "learning_rate": 5.121212121212121e-06, "loss": 0.0123, "step": 7450 }, { "epoch": 662.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3473140299320221, "eval_runtime": 2.0509, "eval_samples_per_second": 77.526, "eval_steps_per_second": 4.876, "step": 7458 }, { "epoch": 664.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3424939215183258, "eval_runtime": 2.0641, "eval_samples_per_second": 77.031, "eval_steps_per_second": 4.845, "step": 7470 }, { "epoch": 664.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.345442533493042, "eval_runtime": 2.0612, "eval_samples_per_second": 77.138, "eval_steps_per_second": 4.851, "step": 7481 }, { "epoch": 665.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3493753969669342, "eval_runtime": 1.9848, "eval_samples_per_second": 80.108, "eval_steps_per_second": 5.038, "step": 7492 }, { "epoch": 666.67, "grad_norm": 0.08370883017778397, "learning_rate": 4.931818181818182e-06, "loss": 0.0083, "step": 7500 }, { "epoch": 666.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35356974601745605, "eval_runtime": 2.1097, "eval_samples_per_second": 75.368, "eval_steps_per_second": 4.74, "step": 7503 }, { "epoch": 668.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34760990738868713, "eval_runtime": 2.1147, "eval_samples_per_second": 75.188, "eval_steps_per_second": 4.729, "step": 7515 }, { "epoch": 668.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34870967268943787, "eval_runtime": 2.0331, "eval_samples_per_second": 78.206, "eval_steps_per_second": 4.919, "step": 7526 }, { "epoch": 669.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35328802466392517, "eval_runtime": 2.4514, "eval_samples_per_second": 64.861, "eval_steps_per_second": 4.079, "step": 7537 }, { "epoch": 670.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35539668798446655, "eval_runtime": 2.1199, "eval_samples_per_second": 75.003, "eval_steps_per_second": 4.717, "step": 7548 }, { "epoch": 671.11, "grad_norm": 2.100541353225708, "learning_rate": 4.7424242424242426e-06, "loss": 0.0079, "step": 7550 }, { "epoch": 672.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3482361435890198, "eval_runtime": 2.1456, "eval_samples_per_second": 74.104, "eval_steps_per_second": 4.661, "step": 7560 }, { "epoch": 672.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34814804792404175, "eval_runtime": 2.0856, "eval_samples_per_second": 76.239, "eval_steps_per_second": 4.795, "step": 7571 }, { "epoch": 673.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.344621866941452, "eval_runtime": 2.2762, "eval_samples_per_second": 69.852, "eval_steps_per_second": 4.393, "step": 7582 }, { "epoch": 674.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3432255983352661, "eval_runtime": 2.0507, "eval_samples_per_second": 77.533, "eval_steps_per_second": 4.876, "step": 7593 }, { "epoch": 675.56, "grad_norm": 0.598809003829956, "learning_rate": 4.553030303030303e-06, "loss": 0.0111, "step": 7600 }, { "epoch": 676.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34696489572525024, "eval_runtime": 2.1457, "eval_samples_per_second": 74.102, "eval_steps_per_second": 4.66, "step": 7605 }, { "epoch": 676.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.33925533294677734, "eval_runtime": 2.134, "eval_samples_per_second": 74.507, "eval_steps_per_second": 4.686, "step": 7616 }, { "epoch": 677.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3386417627334595, "eval_runtime": 2.0634, "eval_samples_per_second": 77.059, "eval_steps_per_second": 4.846, "step": 7627 }, { "epoch": 678.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3310278058052063, "eval_runtime": 2.0308, "eval_samples_per_second": 78.293, "eval_steps_per_second": 4.924, "step": 7638 }, { "epoch": 680.0, "grad_norm": 0.0734761655330658, "learning_rate": 4.363636363636364e-06, "loss": 0.0107, "step": 7650 }, { "epoch": 680.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.329887717962265, "eval_runtime": 2.214, "eval_samples_per_second": 71.816, "eval_steps_per_second": 4.517, "step": 7650 }, { "epoch": 680.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33161696791648865, "eval_runtime": 2.0168, "eval_samples_per_second": 78.839, "eval_steps_per_second": 4.958, "step": 7661 }, { "epoch": 681.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33317527174949646, "eval_runtime": 2.1533, "eval_samples_per_second": 73.84, "eval_steps_per_second": 4.644, "step": 7672 }, { "epoch": 682.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3443678021430969, "eval_runtime": 2.1824, "eval_samples_per_second": 72.855, "eval_steps_per_second": 4.582, "step": 7683 }, { "epoch": 684.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3444632291793823, "eval_runtime": 1.9773, "eval_samples_per_second": 80.414, "eval_steps_per_second": 5.058, "step": 7695 }, { "epoch": 684.44, "grad_norm": 1.5188406705856323, "learning_rate": 4.1742424242424246e-06, "loss": 0.0091, "step": 7700 }, { "epoch": 684.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3443754017353058, "eval_runtime": 2.0477, "eval_samples_per_second": 77.647, "eval_steps_per_second": 4.883, "step": 7706 }, { "epoch": 685.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34085437655448914, "eval_runtime": 2.2252, "eval_samples_per_second": 71.453, "eval_steps_per_second": 4.494, "step": 7717 }, { "epoch": 686.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34413453936576843, "eval_runtime": 2.1451, "eval_samples_per_second": 74.121, "eval_steps_per_second": 4.662, "step": 7728 }, { "epoch": 688.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.35173678398132324, "eval_runtime": 2.0413, "eval_samples_per_second": 77.89, "eval_steps_per_second": 4.899, "step": 7740 }, { "epoch": 688.89, "grad_norm": 1.0382517576217651, "learning_rate": 3.984848484848484e-06, "loss": 0.0081, "step": 7750 }, { "epoch": 688.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3521307110786438, "eval_runtime": 2.0937, "eval_samples_per_second": 75.942, "eval_steps_per_second": 4.776, "step": 7751 }, { "epoch": 689.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.350664883852005, "eval_runtime": 2.1003, "eval_samples_per_second": 75.703, "eval_steps_per_second": 4.761, "step": 7762 }, { "epoch": 690.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3460524082183838, "eval_runtime": 2.0791, "eval_samples_per_second": 76.475, "eval_steps_per_second": 4.81, "step": 7773 }, { "epoch": 692.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.349832683801651, "eval_runtime": 2.0457, "eval_samples_per_second": 77.724, "eval_steps_per_second": 4.888, "step": 7785 }, { "epoch": 692.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.35444310307502747, "eval_runtime": 2.1547, "eval_samples_per_second": 73.793, "eval_steps_per_second": 4.641, "step": 7796 }, { "epoch": 693.33, "grad_norm": 0.36742502450942993, "learning_rate": 3.795454545454546e-06, "loss": 0.009, "step": 7800 }, { "epoch": 693.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.35569891333580017, "eval_runtime": 2.0236, "eval_samples_per_second": 78.575, "eval_steps_per_second": 4.942, "step": 7807 }, { "epoch": 694.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.35327550768852234, "eval_runtime": 2.057, "eval_samples_per_second": 77.297, "eval_steps_per_second": 4.861, "step": 7818 }, { "epoch": 696.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3559163212776184, "eval_runtime": 2.203, "eval_samples_per_second": 72.173, "eval_steps_per_second": 4.539, "step": 7830 }, { "epoch": 696.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35951152443885803, "eval_runtime": 2.0835, "eval_samples_per_second": 76.315, "eval_steps_per_second": 4.8, "step": 7841 }, { "epoch": 697.78, "grad_norm": 0.10021142661571503, "learning_rate": 3.606060606060606e-06, "loss": 0.0078, "step": 7850 }, { "epoch": 697.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3617425560951233, "eval_runtime": 2.0937, "eval_samples_per_second": 75.941, "eval_steps_per_second": 4.776, "step": 7852 }, { "epoch": 698.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3614467978477478, "eval_runtime": 2.2589, "eval_samples_per_second": 70.389, "eval_steps_per_second": 4.427, "step": 7863 }, { "epoch": 700.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34519079327583313, "eval_runtime": 2.046, "eval_samples_per_second": 77.712, "eval_steps_per_second": 4.888, "step": 7875 }, { "epoch": 700.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34312644600868225, "eval_runtime": 2.143, "eval_samples_per_second": 74.196, "eval_steps_per_second": 4.666, "step": 7886 }, { "epoch": 701.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34687530994415283, "eval_runtime": 2.1317, "eval_samples_per_second": 74.59, "eval_steps_per_second": 4.691, "step": 7897 }, { "epoch": 702.22, "grad_norm": 0.013305970467627048, "learning_rate": 3.416666666666667e-06, "loss": 0.0102, "step": 7900 }, { "epoch": 702.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3564489483833313, "eval_runtime": 2.0468, "eval_samples_per_second": 77.682, "eval_steps_per_second": 4.886, "step": 7908 }, { "epoch": 704.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35935157537460327, "eval_runtime": 2.0233, "eval_samples_per_second": 78.584, "eval_steps_per_second": 4.942, "step": 7920 }, { "epoch": 704.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3517804443836212, "eval_runtime": 2.2107, "eval_samples_per_second": 71.924, "eval_steps_per_second": 4.524, "step": 7931 }, { "epoch": 705.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3444287180900574, "eval_runtime": 2.0396, "eval_samples_per_second": 77.958, "eval_steps_per_second": 4.903, "step": 7942 }, { "epoch": 706.67, "grad_norm": 1.1949517726898193, "learning_rate": 3.2272727272727275e-06, "loss": 0.008, "step": 7950 }, { "epoch": 706.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34264177083969116, "eval_runtime": 2.0811, "eval_samples_per_second": 76.402, "eval_steps_per_second": 4.805, "step": 7953 }, { "epoch": 708.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34593525528907776, "eval_runtime": 2.1049, "eval_samples_per_second": 75.537, "eval_steps_per_second": 4.751, "step": 7965 }, { "epoch": 708.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3511156439781189, "eval_runtime": 2.0385, "eval_samples_per_second": 77.999, "eval_steps_per_second": 4.906, "step": 7976 }, { "epoch": 709.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.35437288880348206, "eval_runtime": 2.0421, "eval_samples_per_second": 77.862, "eval_steps_per_second": 4.897, "step": 7987 }, { "epoch": 710.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3566732704639435, "eval_runtime": 2.2624, "eval_samples_per_second": 70.28, "eval_steps_per_second": 4.42, "step": 7998 }, { "epoch": 711.11, "grad_norm": 0.8354963660240173, "learning_rate": 3.0378787878787878e-06, "loss": 0.0053, "step": 8000 }, { "epoch": 712.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3673837184906006, "eval_runtime": 2.0161, "eval_samples_per_second": 78.866, "eval_steps_per_second": 4.96, "step": 8010 }, { "epoch": 712.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3630300760269165, "eval_runtime": 2.0691, "eval_samples_per_second": 76.844, "eval_steps_per_second": 4.833, "step": 8021 }, { "epoch": 713.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3602018654346466, "eval_runtime": 2.0814, "eval_samples_per_second": 76.389, "eval_steps_per_second": 4.804, "step": 8032 }, { "epoch": 714.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35657405853271484, "eval_runtime": 2.0547, "eval_samples_per_second": 77.384, "eval_steps_per_second": 4.867, "step": 8043 }, { "epoch": 715.56, "grad_norm": 0.17041368782520294, "learning_rate": 2.8484848484848484e-06, "loss": 0.0071, "step": 8050 }, { "epoch": 716.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3645796477794647, "eval_runtime": 2.0104, "eval_samples_per_second": 79.087, "eval_steps_per_second": 4.974, "step": 8055 }, { "epoch": 716.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.364641398191452, "eval_runtime": 2.0723, "eval_samples_per_second": 76.725, "eval_steps_per_second": 4.825, "step": 8066 }, { "epoch": 717.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3593458831310272, "eval_runtime": 2.017, "eval_samples_per_second": 78.83, "eval_steps_per_second": 4.958, "step": 8077 }, { "epoch": 718.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3625403344631195, "eval_runtime": 2.1034, "eval_samples_per_second": 75.591, "eval_steps_per_second": 4.754, "step": 8088 }, { "epoch": 720.0, "grad_norm": 0.7891609072685242, "learning_rate": 2.659090909090909e-06, "loss": 0.0071, "step": 8100 }, { "epoch": 720.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.36099299788475037, "eval_runtime": 2.0137, "eval_samples_per_second": 78.958, "eval_steps_per_second": 4.966, "step": 8100 }, { "epoch": 720.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.35885581374168396, "eval_runtime": 2.0236, "eval_samples_per_second": 78.572, "eval_steps_per_second": 4.942, "step": 8111 }, { "epoch": 721.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3528722822666168, "eval_runtime": 2.0499, "eval_samples_per_second": 77.565, "eval_steps_per_second": 4.878, "step": 8122 }, { "epoch": 722.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34843915700912476, "eval_runtime": 2.0515, "eval_samples_per_second": 77.504, "eval_steps_per_second": 4.874, "step": 8133 }, { "epoch": 724.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3468559682369232, "eval_runtime": 2.0267, "eval_samples_per_second": 78.452, "eval_steps_per_second": 4.934, "step": 8145 }, { "epoch": 724.44, "grad_norm": 0.013204416260123253, "learning_rate": 2.46969696969697e-06, "loss": 0.0098, "step": 8150 }, { "epoch": 724.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34806957840919495, "eval_runtime": 2.0094, "eval_samples_per_second": 79.126, "eval_steps_per_second": 4.976, "step": 8156 }, { "epoch": 725.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34636813402175903, "eval_runtime": 2.1662, "eval_samples_per_second": 73.4, "eval_steps_per_second": 4.616, "step": 8167 }, { "epoch": 726.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34824779629707336, "eval_runtime": 2.0311, "eval_samples_per_second": 78.282, "eval_steps_per_second": 4.923, "step": 8178 }, { "epoch": 728.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34667864441871643, "eval_runtime": 2.2582, "eval_samples_per_second": 70.411, "eval_steps_per_second": 4.428, "step": 8190 }, { "epoch": 728.89, "grad_norm": 1.7239004373550415, "learning_rate": 2.2803030303030305e-06, "loss": 0.0159, "step": 8200 }, { "epoch": 728.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.346113383769989, "eval_runtime": 2.0824, "eval_samples_per_second": 76.353, "eval_steps_per_second": 4.802, "step": 8201 }, { "epoch": 729.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3437664210796356, "eval_runtime": 2.0394, "eval_samples_per_second": 77.966, "eval_steps_per_second": 4.904, "step": 8212 }, { "epoch": 730.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33936139941215515, "eval_runtime": 2.0701, "eval_samples_per_second": 76.809, "eval_steps_per_second": 4.831, "step": 8223 }, { "epoch": 732.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3355594277381897, "eval_runtime": 2.1359, "eval_samples_per_second": 74.442, "eval_steps_per_second": 4.682, "step": 8235 }, { "epoch": 732.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3355758488178253, "eval_runtime": 2.0241, "eval_samples_per_second": 78.553, "eval_steps_per_second": 4.94, "step": 8246 }, { "epoch": 733.33, "grad_norm": 1.1134917736053467, "learning_rate": 2.090909090909091e-06, "loss": 0.0128, "step": 8250 }, { "epoch": 733.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.337179034948349, "eval_runtime": 2.162, "eval_samples_per_second": 73.543, "eval_steps_per_second": 4.625, "step": 8257 }, { "epoch": 734.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3391708731651306, "eval_runtime": 2.0183, "eval_samples_per_second": 78.778, "eval_steps_per_second": 4.955, "step": 8268 }, { "epoch": 736.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3454706072807312, "eval_runtime": 2.037, "eval_samples_per_second": 78.056, "eval_steps_per_second": 4.909, "step": 8280 }, { "epoch": 736.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34865179657936096, "eval_runtime": 2.1268, "eval_samples_per_second": 74.76, "eval_steps_per_second": 4.702, "step": 8291 }, { "epoch": 737.78, "grad_norm": 0.008208476938307285, "learning_rate": 1.9015151515151518e-06, "loss": 0.0086, "step": 8300 }, { "epoch": 737.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3467850983142853, "eval_runtime": 2.1854, "eval_samples_per_second": 72.756, "eval_steps_per_second": 4.576, "step": 8302 }, { "epoch": 738.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.344488263130188, "eval_runtime": 2.0623, "eval_samples_per_second": 77.099, "eval_steps_per_second": 4.849, "step": 8313 }, { "epoch": 740.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.34248578548431396, "eval_runtime": 2.0582, "eval_samples_per_second": 77.254, "eval_steps_per_second": 4.859, "step": 8325 }, { "epoch": 740.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3452531397342682, "eval_runtime": 2.1556, "eval_samples_per_second": 73.762, "eval_steps_per_second": 4.639, "step": 8336 }, { "epoch": 741.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34475868940353394, "eval_runtime": 2.0516, "eval_samples_per_second": 77.5, "eval_steps_per_second": 4.874, "step": 8347 }, { "epoch": 742.22, "grad_norm": 0.2444353848695755, "learning_rate": 1.712121212121212e-06, "loss": 0.011, "step": 8350 }, { "epoch": 742.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34120240807533264, "eval_runtime": 2.0936, "eval_samples_per_second": 75.945, "eval_steps_per_second": 4.776, "step": 8358 }, { "epoch": 744.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.33924660086631775, "eval_runtime": 2.2099, "eval_samples_per_second": 71.948, "eval_steps_per_second": 4.525, "step": 8370 }, { "epoch": 744.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3390309512615204, "eval_runtime": 1.9925, "eval_samples_per_second": 79.801, "eval_steps_per_second": 5.019, "step": 8381 }, { "epoch": 745.96, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3395291268825531, "eval_runtime": 1.9807, "eval_samples_per_second": 80.274, "eval_steps_per_second": 5.049, "step": 8392 }, { "epoch": 746.67, "grad_norm": 0.8103430867195129, "learning_rate": 1.5227272727272727e-06, "loss": 0.0074, "step": 8400 }, { "epoch": 746.93, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3383350074291229, "eval_runtime": 2.2625, "eval_samples_per_second": 70.276, "eval_steps_per_second": 4.42, "step": 8403 }, { "epoch": 748.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33776676654815674, "eval_runtime": 2.0087, "eval_samples_per_second": 79.157, "eval_steps_per_second": 4.978, "step": 8415 }, { "epoch": 748.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3348415195941925, "eval_runtime": 2.0796, "eval_samples_per_second": 76.457, "eval_steps_per_second": 4.809, "step": 8426 }, { "epoch": 749.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33347979187965393, "eval_runtime": 2.1871, "eval_samples_per_second": 72.698, "eval_steps_per_second": 4.572, "step": 8437 }, { "epoch": 750.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33422428369522095, "eval_runtime": 2.069, "eval_samples_per_second": 76.849, "eval_steps_per_second": 4.833, "step": 8448 }, { "epoch": 751.11, "grad_norm": 1.5617446899414062, "learning_rate": 1.3333333333333334e-06, "loss": 0.0087, "step": 8450 }, { "epoch": 752.0, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33466464281082153, "eval_runtime": 2.0175, "eval_samples_per_second": 78.81, "eval_steps_per_second": 4.957, "step": 8460 }, { "epoch": 752.98, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.33632901310920715, "eval_runtime": 2.2613, "eval_samples_per_second": 70.315, "eval_steps_per_second": 4.422, "step": 8471 }, { "epoch": 753.96, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3377835154533386, "eval_runtime": 2.0093, "eval_samples_per_second": 79.131, "eval_steps_per_second": 4.977, "step": 8482 }, { "epoch": 754.93, "eval_accuracy": 0.9433962264150944, "eval_loss": 0.3383637070655823, "eval_runtime": 2.0348, "eval_samples_per_second": 78.139, "eval_steps_per_second": 4.914, "step": 8493 }, { "epoch": 755.56, "grad_norm": 1.2671109437942505, "learning_rate": 1.143939393939394e-06, "loss": 0.0061, "step": 8500 }, { "epoch": 756.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3406466245651245, "eval_runtime": 2.0595, "eval_samples_per_second": 77.203, "eval_steps_per_second": 4.856, "step": 8505 }, { "epoch": 756.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.34400761127471924, "eval_runtime": 1.9798, "eval_samples_per_second": 80.313, "eval_steps_per_second": 5.051, "step": 8516 }, { "epoch": 757.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34409239888191223, "eval_runtime": 2.0569, "eval_samples_per_second": 77.301, "eval_steps_per_second": 4.862, "step": 8527 }, { "epoch": 758.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34241315722465515, "eval_runtime": 2.0733, "eval_samples_per_second": 76.691, "eval_steps_per_second": 4.823, "step": 8538 }, { "epoch": 760.0, "grad_norm": 2.0512726306915283, "learning_rate": 9.545454545454546e-07, "loss": 0.0119, "step": 8550 }, { "epoch": 760.0, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3426421582698822, "eval_runtime": 2.0315, "eval_samples_per_second": 78.268, "eval_steps_per_second": 4.922, "step": 8550 }, { "epoch": 760.98, "eval_accuracy": 0.9371069182389937, "eval_loss": 0.3427829444408417, "eval_runtime": 2.1633, "eval_samples_per_second": 73.499, "eval_steps_per_second": 4.623, "step": 8561 }, { "epoch": 761.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34399789571762085, "eval_runtime": 2.1363, "eval_samples_per_second": 74.428, "eval_steps_per_second": 4.681, "step": 8572 }, { "epoch": 762.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3443286418914795, "eval_runtime": 2.0533, "eval_samples_per_second": 77.437, "eval_steps_per_second": 4.87, "step": 8583 }, { "epoch": 764.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.345469206571579, "eval_runtime": 1.9651, "eval_samples_per_second": 80.911, "eval_steps_per_second": 5.089, "step": 8595 }, { "epoch": 764.44, "grad_norm": 0.15614187717437744, "learning_rate": 7.651515151515152e-07, "loss": 0.0056, "step": 8600 }, { "epoch": 764.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34602606296539307, "eval_runtime": 2.0712, "eval_samples_per_second": 76.769, "eval_steps_per_second": 4.828, "step": 8606 }, { "epoch": 765.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3463137745857239, "eval_runtime": 1.9634, "eval_samples_per_second": 80.983, "eval_steps_per_second": 5.093, "step": 8617 }, { "epoch": 766.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34662124514579773, "eval_runtime": 2.0264, "eval_samples_per_second": 78.466, "eval_steps_per_second": 4.935, "step": 8628 }, { "epoch": 768.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3465888202190399, "eval_runtime": 2.1276, "eval_samples_per_second": 74.732, "eval_steps_per_second": 4.7, "step": 8640 }, { "epoch": 768.89, "grad_norm": 0.13273529708385468, "learning_rate": 5.757575757575757e-07, "loss": 0.0094, "step": 8650 }, { "epoch": 768.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34740516543388367, "eval_runtime": 1.986, "eval_samples_per_second": 80.062, "eval_steps_per_second": 5.035, "step": 8651 }, { "epoch": 769.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3476426601409912, "eval_runtime": 2.2993, "eval_samples_per_second": 69.152, "eval_steps_per_second": 4.349, "step": 8662 }, { "epoch": 770.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34822559356689453, "eval_runtime": 2.054, "eval_samples_per_second": 77.411, "eval_steps_per_second": 4.869, "step": 8673 }, { "epoch": 772.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.348609060049057, "eval_runtime": 2.0775, "eval_samples_per_second": 76.533, "eval_steps_per_second": 4.813, "step": 8685 }, { "epoch": 772.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34849491715431213, "eval_runtime": 1.9848, "eval_samples_per_second": 80.11, "eval_steps_per_second": 5.038, "step": 8696 }, { "epoch": 773.33, "grad_norm": 2.092862606048584, "learning_rate": 3.8636363636363636e-07, "loss": 0.014, "step": 8700 }, { "epoch": 773.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3478315770626068, "eval_runtime": 2.0318, "eval_samples_per_second": 78.257, "eval_steps_per_second": 4.922, "step": 8707 }, { "epoch": 774.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.347221702337265, "eval_runtime": 2.0723, "eval_samples_per_second": 76.726, "eval_steps_per_second": 4.826, "step": 8718 }, { "epoch": 776.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34651002287864685, "eval_runtime": 1.9895, "eval_samples_per_second": 79.92, "eval_steps_per_second": 5.026, "step": 8730 }, { "epoch": 776.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3461478352546692, "eval_runtime": 2.0438, "eval_samples_per_second": 77.796, "eval_steps_per_second": 4.893, "step": 8741 }, { "epoch": 777.78, "grad_norm": 0.42866629362106323, "learning_rate": 1.9696969696969696e-07, "loss": 0.0126, "step": 8750 }, { "epoch": 777.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3467194736003876, "eval_runtime": 2.0767, "eval_samples_per_second": 76.564, "eval_steps_per_second": 4.815, "step": 8752 }, { "epoch": 778.93, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3471050262451172, "eval_runtime": 2.0846, "eval_samples_per_second": 76.272, "eval_steps_per_second": 4.797, "step": 8763 }, { "epoch": 780.0, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.34714454412460327, "eval_runtime": 2.1516, "eval_samples_per_second": 73.897, "eval_steps_per_second": 4.648, "step": 8775 }, { "epoch": 780.98, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3471665382385254, "eval_runtime": 2.0781, "eval_samples_per_second": 76.511, "eval_steps_per_second": 4.812, "step": 8786 }, { "epoch": 781.96, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3471100628376007, "eval_runtime": 2.0029, "eval_samples_per_second": 79.386, "eval_steps_per_second": 4.993, "step": 8797 }, { "epoch": 782.22, "grad_norm": 0.060126595199108124, "learning_rate": 7.575757575757576e-09, "loss": 0.0048, "step": 8800 }, { "epoch": 782.22, "eval_accuracy": 0.9308176100628931, "eval_loss": 0.3471885025501251, "eval_runtime": 2.0337, "eval_samples_per_second": 78.181, "eval_steps_per_second": 4.917, "step": 8800 }, { "epoch": 782.22, "step": 8800, "total_flos": 4.912188447589224e+18, "train_loss": 0.0709631282125007, "train_runtime": 5794.2307, "train_samples_per_second": 98.995, "train_steps_per_second": 1.519 } ], "logging_steps": 50, "max_steps": 8800, "num_input_tokens_seen": 0, "num_train_epochs": 800, "save_steps": 500, "total_flos": 4.912188447589224e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }