|
{ |
|
"best_metric": 1.0, |
|
"best_model_checkpoint": "resnet-50-finetuned-student_kaggle/checkpoint-423", |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 940, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 54.35947799682617, |
|
"learning_rate": 5.319148936170213e-06, |
|
"loss": 0.9341, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 34.59556579589844, |
|
"learning_rate": 1.0638297872340426e-05, |
|
"loss": 0.9157, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 42.179847717285156, |
|
"learning_rate": 1.595744680851064e-05, |
|
"loss": 0.7801, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 30.099029541015625, |
|
"learning_rate": 2.1276595744680852e-05, |
|
"loss": 0.7142, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.610062893081761, |
|
"eval_loss": 0.6418222188949585, |
|
"eval_runtime": 7.6299, |
|
"eval_samples_per_second": 83.356, |
|
"eval_steps_per_second": 2.621, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.0638297872340425, |
|
"grad_norm": 48.046546936035156, |
|
"learning_rate": 2.6595744680851064e-05, |
|
"loss": 0.7114, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"grad_norm": 45.94879913330078, |
|
"learning_rate": 3.191489361702128e-05, |
|
"loss": 0.6014, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.4893617021276595, |
|
"grad_norm": 17.69209861755371, |
|
"learning_rate": 3.723404255319149e-05, |
|
"loss": 0.4815, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.702127659574468, |
|
"grad_norm": 18.821670532226562, |
|
"learning_rate": 4.2553191489361704e-05, |
|
"loss": 0.463, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"grad_norm": 23.751588821411133, |
|
"learning_rate": 4.787234042553192e-05, |
|
"loss": 0.3351, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.8946540880503144, |
|
"eval_loss": 0.25965991616249084, |
|
"eval_runtime": 7.6659, |
|
"eval_samples_per_second": 82.964, |
|
"eval_steps_per_second": 2.609, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.127659574468085, |
|
"grad_norm": 18.11069679260254, |
|
"learning_rate": 4.964539007092199e-05, |
|
"loss": 0.3193, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.3404255319148937, |
|
"grad_norm": 12.397391319274902, |
|
"learning_rate": 4.905437352245863e-05, |
|
"loss": 0.2768, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.5531914893617023, |
|
"grad_norm": 16.857635498046875, |
|
"learning_rate": 4.846335697399527e-05, |
|
"loss": 0.2594, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.7659574468085104, |
|
"grad_norm": 12.635449409484863, |
|
"learning_rate": 4.787234042553192e-05, |
|
"loss": 0.2063, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.978723404255319, |
|
"grad_norm": 15.277303695678711, |
|
"learning_rate": 4.728132387706856e-05, |
|
"loss": 0.2574, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9779874213836478, |
|
"eval_loss": 0.10460298508405685, |
|
"eval_runtime": 8.3391, |
|
"eval_samples_per_second": 76.267, |
|
"eval_steps_per_second": 2.398, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 3.1914893617021276, |
|
"grad_norm": 14.497098922729492, |
|
"learning_rate": 4.669030732860521e-05, |
|
"loss": 0.2349, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.404255319148936, |
|
"grad_norm": 17.647092819213867, |
|
"learning_rate": 4.609929078014185e-05, |
|
"loss": 0.1631, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.617021276595745, |
|
"grad_norm": 12.856146812438965, |
|
"learning_rate": 4.550827423167849e-05, |
|
"loss": 0.1675, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.829787234042553, |
|
"grad_norm": 7.248583793640137, |
|
"learning_rate": 4.491725768321513e-05, |
|
"loss": 0.1479, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9874213836477987, |
|
"eval_loss": 0.061614990234375, |
|
"eval_runtime": 8.4097, |
|
"eval_samples_per_second": 75.627, |
|
"eval_steps_per_second": 2.378, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 4.042553191489362, |
|
"grad_norm": 25.721847534179688, |
|
"learning_rate": 4.432624113475177e-05, |
|
"loss": 0.1528, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.25531914893617, |
|
"grad_norm": 6.252942085266113, |
|
"learning_rate": 4.373522458628842e-05, |
|
"loss": 0.145, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.468085106382979, |
|
"grad_norm": 6.672601222991943, |
|
"learning_rate": 4.3144208037825064e-05, |
|
"loss": 0.1247, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.680851063829787, |
|
"grad_norm": 20.4866886138916, |
|
"learning_rate": 4.2553191489361704e-05, |
|
"loss": 0.1405, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.8936170212765955, |
|
"grad_norm": 19.644893646240234, |
|
"learning_rate": 4.1962174940898345e-05, |
|
"loss": 0.1284, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9952830188679245, |
|
"eval_loss": 0.02317511849105358, |
|
"eval_runtime": 10.3441, |
|
"eval_samples_per_second": 61.485, |
|
"eval_steps_per_second": 1.933, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 5.1063829787234045, |
|
"grad_norm": 10.905556678771973, |
|
"learning_rate": 4.1371158392434986e-05, |
|
"loss": 0.1178, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.319148936170213, |
|
"grad_norm": 11.02078628540039, |
|
"learning_rate": 4.078014184397163e-05, |
|
"loss": 0.1176, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.531914893617021, |
|
"grad_norm": 7.510810375213623, |
|
"learning_rate": 4.018912529550828e-05, |
|
"loss": 0.0881, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.74468085106383, |
|
"grad_norm": 2.0541610717773438, |
|
"learning_rate": 3.959810874704492e-05, |
|
"loss": 0.1274, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.957446808510638, |
|
"grad_norm": 7.680713653564453, |
|
"learning_rate": 3.900709219858156e-05, |
|
"loss": 0.077, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.9952830188679245, |
|
"eval_loss": 0.015012426301836967, |
|
"eval_runtime": 8.4118, |
|
"eval_samples_per_second": 75.608, |
|
"eval_steps_per_second": 2.378, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 6.170212765957447, |
|
"grad_norm": 8.117420196533203, |
|
"learning_rate": 3.84160756501182e-05, |
|
"loss": 0.172, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 6.382978723404255, |
|
"grad_norm": 23.871868133544922, |
|
"learning_rate": 3.782505910165485e-05, |
|
"loss": 0.0613, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.595744680851064, |
|
"grad_norm": 15.407998085021973, |
|
"learning_rate": 3.723404255319149e-05, |
|
"loss": 0.1287, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.808510638297872, |
|
"grad_norm": 6.940992832183838, |
|
"learning_rate": 3.664302600472813e-05, |
|
"loss": 0.103, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9984276729559748, |
|
"eval_loss": 0.010532047599554062, |
|
"eval_runtime": 7.9689, |
|
"eval_samples_per_second": 79.81, |
|
"eval_steps_per_second": 2.51, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 7.0212765957446805, |
|
"grad_norm": 4.598968029022217, |
|
"learning_rate": 3.605200945626478e-05, |
|
"loss": 0.0906, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 7.23404255319149, |
|
"grad_norm": 7.53684663772583, |
|
"learning_rate": 3.546099290780142e-05, |
|
"loss": 0.0792, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.446808510638298, |
|
"grad_norm": 2.750072479248047, |
|
"learning_rate": 3.4869976359338065e-05, |
|
"loss": 0.091, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 7.659574468085106, |
|
"grad_norm": 4.067008018493652, |
|
"learning_rate": 3.4278959810874706e-05, |
|
"loss": 0.0777, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.872340425531915, |
|
"grad_norm": 15.093037605285645, |
|
"learning_rate": 3.3687943262411347e-05, |
|
"loss": 0.0922, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9984276729559748, |
|
"eval_loss": 0.009353628382086754, |
|
"eval_runtime": 7.7264, |
|
"eval_samples_per_second": 82.315, |
|
"eval_steps_per_second": 2.589, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 8.085106382978724, |
|
"grad_norm": 8.675619125366211, |
|
"learning_rate": 3.309692671394799e-05, |
|
"loss": 0.1211, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 8.297872340425531, |
|
"grad_norm": 5.723608493804932, |
|
"learning_rate": 3.2505910165484634e-05, |
|
"loss": 0.0645, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 8.51063829787234, |
|
"grad_norm": 8.031245231628418, |
|
"learning_rate": 3.191489361702128e-05, |
|
"loss": 0.0787, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.72340425531915, |
|
"grad_norm": 2.483238935470581, |
|
"learning_rate": 3.132387706855792e-05, |
|
"loss": 0.0683, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 8.936170212765958, |
|
"grad_norm": 7.612273216247559, |
|
"learning_rate": 3.073286052009456e-05, |
|
"loss": 0.08, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.00555912172421813, |
|
"eval_runtime": 8.4427, |
|
"eval_samples_per_second": 75.331, |
|
"eval_steps_per_second": 2.369, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 9.148936170212766, |
|
"grad_norm": 2.0842654705047607, |
|
"learning_rate": 3.0141843971631207e-05, |
|
"loss": 0.0812, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 9.361702127659575, |
|
"grad_norm": 17.500883102416992, |
|
"learning_rate": 2.9550827423167847e-05, |
|
"loss": 0.0688, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 9.574468085106384, |
|
"grad_norm": 9.671988487243652, |
|
"learning_rate": 2.895981087470449e-05, |
|
"loss": 0.0837, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 9.787234042553191, |
|
"grad_norm": 13.061412811279297, |
|
"learning_rate": 2.836879432624114e-05, |
|
"loss": 0.1024, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 11.679757118225098, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.0492, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.004496446345001459, |
|
"eval_runtime": 9.0183, |
|
"eval_samples_per_second": 70.523, |
|
"eval_steps_per_second": 2.218, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 10.212765957446809, |
|
"grad_norm": 3.253716230392456, |
|
"learning_rate": 2.7186761229314423e-05, |
|
"loss": 0.0495, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 10.425531914893616, |
|
"grad_norm": 5.826026916503906, |
|
"learning_rate": 2.6595744680851064e-05, |
|
"loss": 0.0266, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 10.638297872340425, |
|
"grad_norm": 9.738965034484863, |
|
"learning_rate": 2.6004728132387708e-05, |
|
"loss": 0.0739, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 10.851063829787234, |
|
"grad_norm": 9.04916763305664, |
|
"learning_rate": 2.5413711583924348e-05, |
|
"loss": 0.0574, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.004308629781007767, |
|
"eval_runtime": 8.4135, |
|
"eval_samples_per_second": 75.593, |
|
"eval_steps_per_second": 2.377, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 11.063829787234043, |
|
"grad_norm": 11.041916847229004, |
|
"learning_rate": 2.4822695035460995e-05, |
|
"loss": 0.0977, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 11.27659574468085, |
|
"grad_norm": 31.96723175048828, |
|
"learning_rate": 2.4231678486997636e-05, |
|
"loss": 0.0541, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 11.48936170212766, |
|
"grad_norm": 2.877957344055176, |
|
"learning_rate": 2.364066193853428e-05, |
|
"loss": 0.0401, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 11.702127659574469, |
|
"grad_norm": 3.020596742630005, |
|
"learning_rate": 2.3049645390070924e-05, |
|
"loss": 0.0284, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 11.914893617021276, |
|
"grad_norm": 7.811126232147217, |
|
"learning_rate": 2.2458628841607564e-05, |
|
"loss": 0.0382, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0022806336637586355, |
|
"eval_runtime": 7.7232, |
|
"eval_samples_per_second": 82.35, |
|
"eval_steps_per_second": 2.59, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 12.127659574468085, |
|
"grad_norm": 11.633199691772461, |
|
"learning_rate": 2.186761229314421e-05, |
|
"loss": 0.0589, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 12.340425531914894, |
|
"grad_norm": 0.3253759443759918, |
|
"learning_rate": 2.1276595744680852e-05, |
|
"loss": 0.041, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 12.553191489361701, |
|
"grad_norm": 2.2859044075012207, |
|
"learning_rate": 2.0685579196217493e-05, |
|
"loss": 0.0578, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 12.76595744680851, |
|
"grad_norm": 1.5324259996414185, |
|
"learning_rate": 2.009456264775414e-05, |
|
"loss": 0.0312, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 12.97872340425532, |
|
"grad_norm": 6.985143661499023, |
|
"learning_rate": 1.950354609929078e-05, |
|
"loss": 0.0666, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0022491966374218464, |
|
"eval_runtime": 8.1081, |
|
"eval_samples_per_second": 78.44, |
|
"eval_steps_per_second": 2.467, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 13.191489361702128, |
|
"grad_norm": 8.847366333007812, |
|
"learning_rate": 1.8912529550827425e-05, |
|
"loss": 0.0539, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 13.404255319148936, |
|
"grad_norm": 10.476814270019531, |
|
"learning_rate": 1.8321513002364065e-05, |
|
"loss": 0.0369, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 13.617021276595745, |
|
"grad_norm": 5.339621067047119, |
|
"learning_rate": 1.773049645390071e-05, |
|
"loss": 0.0308, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 13.829787234042554, |
|
"grad_norm": 14.648975372314453, |
|
"learning_rate": 1.7139479905437353e-05, |
|
"loss": 0.0477, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0021932125091552734, |
|
"eval_runtime": 8.4493, |
|
"eval_samples_per_second": 75.272, |
|
"eval_steps_per_second": 2.367, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 14.042553191489361, |
|
"grad_norm": 5.510159969329834, |
|
"learning_rate": 1.6548463356973994e-05, |
|
"loss": 0.028, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 14.25531914893617, |
|
"grad_norm": 5.803068161010742, |
|
"learning_rate": 1.595744680851064e-05, |
|
"loss": 0.0522, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 14.46808510638298, |
|
"grad_norm": 1.1623107194900513, |
|
"learning_rate": 1.536643026004728e-05, |
|
"loss": 0.0481, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 14.680851063829786, |
|
"grad_norm": 12.495600700378418, |
|
"learning_rate": 1.4775413711583924e-05, |
|
"loss": 0.0588, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 14.893617021276595, |
|
"grad_norm": 3.4236888885498047, |
|
"learning_rate": 1.418439716312057e-05, |
|
"loss": 0.0614, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.002270177938044071, |
|
"eval_runtime": 8.5563, |
|
"eval_samples_per_second": 74.331, |
|
"eval_steps_per_second": 2.337, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 15.106382978723405, |
|
"grad_norm": 11.681058883666992, |
|
"learning_rate": 1.3593380614657212e-05, |
|
"loss": 0.0674, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 15.319148936170214, |
|
"grad_norm": 1.846946120262146, |
|
"learning_rate": 1.3002364066193854e-05, |
|
"loss": 0.0415, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 15.53191489361702, |
|
"grad_norm": 8.939858436584473, |
|
"learning_rate": 1.2411347517730498e-05, |
|
"loss": 0.0189, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 15.74468085106383, |
|
"grad_norm": 3.521784782409668, |
|
"learning_rate": 1.182033096926714e-05, |
|
"loss": 0.0585, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 15.957446808510639, |
|
"grad_norm": 1.9891993999481201, |
|
"learning_rate": 1.1229314420803782e-05, |
|
"loss": 0.0282, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0013930280692875385, |
|
"eval_runtime": 8.1789, |
|
"eval_samples_per_second": 77.761, |
|
"eval_steps_per_second": 2.445, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 16.170212765957448, |
|
"grad_norm": 7.0705246925354, |
|
"learning_rate": 1.0638297872340426e-05, |
|
"loss": 0.0508, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 16.382978723404257, |
|
"grad_norm": 11.365514755249023, |
|
"learning_rate": 1.004728132387707e-05, |
|
"loss": 0.0393, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 16.595744680851062, |
|
"grad_norm": 8.82397747039795, |
|
"learning_rate": 9.456264775413712e-06, |
|
"loss": 0.0509, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 16.80851063829787, |
|
"grad_norm": 5.013731002807617, |
|
"learning_rate": 8.865248226950355e-06, |
|
"loss": 0.0659, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0016287014586851, |
|
"eval_runtime": 7.6703, |
|
"eval_samples_per_second": 82.917, |
|
"eval_steps_per_second": 2.607, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 17.02127659574468, |
|
"grad_norm": 2.8596644401550293, |
|
"learning_rate": 8.274231678486997e-06, |
|
"loss": 0.0285, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 17.23404255319149, |
|
"grad_norm": 10.184608459472656, |
|
"learning_rate": 7.68321513002364e-06, |
|
"loss": 0.062, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 17.4468085106383, |
|
"grad_norm": 6.029819011688232, |
|
"learning_rate": 7.092198581560285e-06, |
|
"loss": 0.0672, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 17.659574468085108, |
|
"grad_norm": 0.9212875366210938, |
|
"learning_rate": 6.501182033096927e-06, |
|
"loss": 0.0404, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 17.872340425531917, |
|
"grad_norm": 0.5147794485092163, |
|
"learning_rate": 5.91016548463357e-06, |
|
"loss": 0.0586, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0009691208251751959, |
|
"eval_runtime": 8.4381, |
|
"eval_samples_per_second": 75.373, |
|
"eval_steps_per_second": 2.37, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 18.085106382978722, |
|
"grad_norm": 3.351142406463623, |
|
"learning_rate": 5.319148936170213e-06, |
|
"loss": 0.0333, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 18.29787234042553, |
|
"grad_norm": 7.570976257324219, |
|
"learning_rate": 4.728132387706856e-06, |
|
"loss": 0.0401, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 18.51063829787234, |
|
"grad_norm": 6.660007953643799, |
|
"learning_rate": 4.137115839243498e-06, |
|
"loss": 0.0329, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 18.72340425531915, |
|
"grad_norm": 11.373592376708984, |
|
"learning_rate": 3.5460992907801423e-06, |
|
"loss": 0.0523, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 18.93617021276596, |
|
"grad_norm": 4.553757667541504, |
|
"learning_rate": 2.955082742316785e-06, |
|
"loss": 0.0557, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0012750416062772274, |
|
"eval_runtime": 8.3957, |
|
"eval_samples_per_second": 75.753, |
|
"eval_steps_per_second": 2.382, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 19.148936170212767, |
|
"grad_norm": 1.8293001651763916, |
|
"learning_rate": 2.364066193853428e-06, |
|
"loss": 0.0248, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 19.361702127659573, |
|
"grad_norm": 3.5974695682525635, |
|
"learning_rate": 1.7730496453900712e-06, |
|
"loss": 0.0298, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 19.574468085106382, |
|
"grad_norm": 4.631837844848633, |
|
"learning_rate": 1.182033096926714e-06, |
|
"loss": 0.0272, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 19.78723404255319, |
|
"grad_norm": 1.5552431344985962, |
|
"learning_rate": 5.91016548463357e-07, |
|
"loss": 0.0281, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 5.388515949249268, |
|
"learning_rate": 0.0, |
|
"loss": 0.07, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.001178326434455812, |
|
"eval_runtime": 8.4555, |
|
"eval_samples_per_second": 75.217, |
|
"eval_steps_per_second": 2.365, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 940, |
|
"total_flos": 6.302667737382912e+17, |
|
"train_loss": 0.13929352825309368, |
|
"train_runtime": 703.6937, |
|
"train_samples_per_second": 42.177, |
|
"train_steps_per_second": 1.336 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 940, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 6.302667737382912e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|