|
{ |
|
"best_metric": 0.8585118376550169, |
|
"best_model_checkpoint": "swinv2-tiny-patch4-window8-256-finetuned-galaxy10-decals/checkpoint-2480", |
|
"epoch": 19.879759519038075, |
|
"eval_steps": 500, |
|
"global_step": 2480, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08016032064128256, |
|
"grad_norm": 3.107008457183838, |
|
"learning_rate": 2.0161290322580646e-06, |
|
"loss": 2.3373, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16032064128256512, |
|
"grad_norm": 3.129793167114258, |
|
"learning_rate": 4.032258064516129e-06, |
|
"loss": 2.2991, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.24048096192384769, |
|
"grad_norm": 3.2072296142578125, |
|
"learning_rate": 6.048387096774194e-06, |
|
"loss": 2.2401, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.32064128256513025, |
|
"grad_norm": 3.172947406768799, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 2.1683, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.40080160320641284, |
|
"grad_norm": 4.017691612243652, |
|
"learning_rate": 1.0080645161290323e-05, |
|
"loss": 2.0858, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.48096192384769537, |
|
"grad_norm": 4.591518402099609, |
|
"learning_rate": 1.2096774193548388e-05, |
|
"loss": 1.9855, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.561122244488978, |
|
"grad_norm": 6.409041404724121, |
|
"learning_rate": 1.4112903225806454e-05, |
|
"loss": 1.8467, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6412825651302605, |
|
"grad_norm": 7.812229633331299, |
|
"learning_rate": 1.6129032258064517e-05, |
|
"loss": 1.7127, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7214428857715431, |
|
"grad_norm": 9.216848373413086, |
|
"learning_rate": 1.8145161290322583e-05, |
|
"loss": 1.551, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8016032064128257, |
|
"grad_norm": 7.334836483001709, |
|
"learning_rate": 2.0161290322580645e-05, |
|
"loss": 1.4516, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8817635270541082, |
|
"grad_norm": 7.710146427154541, |
|
"learning_rate": 2.217741935483871e-05, |
|
"loss": 1.3589, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9619238476953907, |
|
"grad_norm": 11.563941955566406, |
|
"learning_rate": 2.4193548387096777e-05, |
|
"loss": 1.318, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9939879759519038, |
|
"eval_accuracy": 0.6358511837655016, |
|
"eval_loss": 1.0409355163574219, |
|
"eval_runtime": 18.6446, |
|
"eval_samples_per_second": 95.148, |
|
"eval_steps_per_second": 3.004, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.0420841683366733, |
|
"grad_norm": 9.838645935058594, |
|
"learning_rate": 2.620967741935484e-05, |
|
"loss": 1.2543, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.122244488977956, |
|
"grad_norm": 9.036652565002441, |
|
"learning_rate": 2.822580645161291e-05, |
|
"loss": 1.2318, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2024048096192386, |
|
"grad_norm": 7.428995132446289, |
|
"learning_rate": 3.024193548387097e-05, |
|
"loss": 1.2297, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.282565130260521, |
|
"grad_norm": 7.845966815948486, |
|
"learning_rate": 3.2258064516129034e-05, |
|
"loss": 1.1232, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3627254509018036, |
|
"grad_norm": 7.774607181549072, |
|
"learning_rate": 3.427419354838709e-05, |
|
"loss": 1.0894, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.4428857715430863, |
|
"grad_norm": 12.192733764648438, |
|
"learning_rate": 3.6290322580645165e-05, |
|
"loss": 1.0552, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.5230460921843687, |
|
"grad_norm": 7.907654762268066, |
|
"learning_rate": 3.8306451612903224e-05, |
|
"loss": 1.0559, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.6032064128256514, |
|
"grad_norm": 8.524667739868164, |
|
"learning_rate": 4.032258064516129e-05, |
|
"loss": 1.0315, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6833667334669338, |
|
"grad_norm": 9.733327865600586, |
|
"learning_rate": 4.2338709677419356e-05, |
|
"loss": 1.0234, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.7635270541082164, |
|
"grad_norm": 8.974815368652344, |
|
"learning_rate": 4.435483870967742e-05, |
|
"loss": 0.9642, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.843687374749499, |
|
"grad_norm": 10.060734748840332, |
|
"learning_rate": 4.637096774193548e-05, |
|
"loss": 0.9358, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.9238476953907817, |
|
"grad_norm": 10.902128219604492, |
|
"learning_rate": 4.8387096774193554e-05, |
|
"loss": 0.9268, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.9959919839679359, |
|
"eval_accuracy": 0.7497181510710259, |
|
"eval_loss": 0.7163556814193726, |
|
"eval_runtime": 15.1868, |
|
"eval_samples_per_second": 116.812, |
|
"eval_steps_per_second": 3.687, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.004008016032064, |
|
"grad_norm": 8.283767700195312, |
|
"learning_rate": 4.995519713261649e-05, |
|
"loss": 0.915, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.0841683366733466, |
|
"grad_norm": 9.61968994140625, |
|
"learning_rate": 4.973118279569893e-05, |
|
"loss": 0.8681, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.164328657314629, |
|
"grad_norm": 8.20541763305664, |
|
"learning_rate": 4.950716845878137e-05, |
|
"loss": 0.9177, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.244488977955912, |
|
"grad_norm": 6.9433369636535645, |
|
"learning_rate": 4.92831541218638e-05, |
|
"loss": 0.8946, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.3246492985971945, |
|
"grad_norm": 10.144634246826172, |
|
"learning_rate": 4.905913978494624e-05, |
|
"loss": 0.8933, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.404809619238477, |
|
"grad_norm": 12.519510269165039, |
|
"learning_rate": 4.8835125448028677e-05, |
|
"loss": 0.8515, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.4849699398797593, |
|
"grad_norm": 7.839648246765137, |
|
"learning_rate": 4.8611111111111115e-05, |
|
"loss": 0.9103, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.565130260521042, |
|
"grad_norm": 9.056415557861328, |
|
"learning_rate": 4.8387096774193554e-05, |
|
"loss": 0.8601, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.6452905811623246, |
|
"grad_norm": 10.264932632446289, |
|
"learning_rate": 4.8163082437275986e-05, |
|
"loss": 0.8363, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.7254509018036073, |
|
"grad_norm": 8.888627052307129, |
|
"learning_rate": 4.7939068100358424e-05, |
|
"loss": 0.8075, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.80561122244489, |
|
"grad_norm": 10.737099647521973, |
|
"learning_rate": 4.771505376344086e-05, |
|
"loss": 0.9107, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.8857715430861726, |
|
"grad_norm": 7.076110363006592, |
|
"learning_rate": 4.74910394265233e-05, |
|
"loss": 0.7776, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.9659318637274548, |
|
"grad_norm": 7.52149772644043, |
|
"learning_rate": 4.726702508960574e-05, |
|
"loss": 0.8221, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.997995991983968, |
|
"eval_accuracy": 0.7874859075535513, |
|
"eval_loss": 0.6210038065910339, |
|
"eval_runtime": 16.893, |
|
"eval_samples_per_second": 105.014, |
|
"eval_steps_per_second": 3.315, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.0460921843687374, |
|
"grad_norm": 10.58385181427002, |
|
"learning_rate": 4.704301075268818e-05, |
|
"loss": 0.8115, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.12625250501002, |
|
"grad_norm": 11.055846214294434, |
|
"learning_rate": 4.681899641577061e-05, |
|
"loss": 0.7765, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.2064128256513027, |
|
"grad_norm": 8.763452529907227, |
|
"learning_rate": 4.659498207885305e-05, |
|
"loss": 0.7307, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.2865731462925853, |
|
"grad_norm": 6.804797172546387, |
|
"learning_rate": 4.637096774193548e-05, |
|
"loss": 0.7876, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.3667334669338675, |
|
"grad_norm": 6.112203121185303, |
|
"learning_rate": 4.614695340501792e-05, |
|
"loss": 0.7391, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.44689378757515, |
|
"grad_norm": 8.772920608520508, |
|
"learning_rate": 4.5922939068100365e-05, |
|
"loss": 0.7453, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.527054108216433, |
|
"grad_norm": 5.974344253540039, |
|
"learning_rate": 4.56989247311828e-05, |
|
"loss": 0.7345, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.6072144288577155, |
|
"grad_norm": 8.748202323913574, |
|
"learning_rate": 4.5474910394265236e-05, |
|
"loss": 0.8431, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.687374749498998, |
|
"grad_norm": 8.03186321258545, |
|
"learning_rate": 4.5250896057347674e-05, |
|
"loss": 0.7337, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.7675350701402808, |
|
"grad_norm": 7.393523216247559, |
|
"learning_rate": 4.5026881720430106e-05, |
|
"loss": 0.7451, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.847695390781563, |
|
"grad_norm": 6.694340705871582, |
|
"learning_rate": 4.4802867383512545e-05, |
|
"loss": 0.8017, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.9278557114228456, |
|
"grad_norm": 6.493546962738037, |
|
"learning_rate": 4.4578853046594983e-05, |
|
"loss": 0.7276, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.8162344983089064, |
|
"eval_loss": 0.5563604235649109, |
|
"eval_runtime": 20.2707, |
|
"eval_samples_per_second": 87.515, |
|
"eval_steps_per_second": 2.763, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 4.008016032064128, |
|
"grad_norm": 6.828517913818359, |
|
"learning_rate": 4.435483870967742e-05, |
|
"loss": 0.7295, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.0881763527054105, |
|
"grad_norm": 8.60299301147461, |
|
"learning_rate": 4.413082437275986e-05, |
|
"loss": 0.7452, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.168336673346693, |
|
"grad_norm": 7.014772415161133, |
|
"learning_rate": 4.390681003584229e-05, |
|
"loss": 0.7108, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.248496993987976, |
|
"grad_norm": 10.6813325881958, |
|
"learning_rate": 4.368279569892473e-05, |
|
"loss": 0.7479, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.328657314629258, |
|
"grad_norm": 6.949085712432861, |
|
"learning_rate": 4.345878136200717e-05, |
|
"loss": 0.6518, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.408817635270541, |
|
"grad_norm": 6.594024658203125, |
|
"learning_rate": 4.323476702508961e-05, |
|
"loss": 0.7169, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.488977955911824, |
|
"grad_norm": 8.18333625793457, |
|
"learning_rate": 4.301075268817205e-05, |
|
"loss": 0.7185, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.569138276553106, |
|
"grad_norm": 5.988694190979004, |
|
"learning_rate": 4.2786738351254486e-05, |
|
"loss": 0.6685, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.649298597194389, |
|
"grad_norm": 7.308699607849121, |
|
"learning_rate": 4.256272401433692e-05, |
|
"loss": 0.6542, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.729458917835672, |
|
"grad_norm": 7.458045959472656, |
|
"learning_rate": 4.2338709677419356e-05, |
|
"loss": 0.6993, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.809619238476954, |
|
"grad_norm": 8.139283180236816, |
|
"learning_rate": 4.2114695340501795e-05, |
|
"loss": 0.7078, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.889779559118237, |
|
"grad_norm": 6.669909954071045, |
|
"learning_rate": 4.1890681003584233e-05, |
|
"loss": 0.6627, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.969939879759519, |
|
"grad_norm": 6.659294605255127, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.6425, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.993987975951904, |
|
"eval_accuracy": 0.8162344983089064, |
|
"eval_loss": 0.5226049423217773, |
|
"eval_runtime": 13.1794, |
|
"eval_samples_per_second": 134.604, |
|
"eval_steps_per_second": 4.249, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 5.050100200400801, |
|
"grad_norm": 6.521961212158203, |
|
"learning_rate": 4.1442652329749104e-05, |
|
"loss": 0.6918, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 5.130260521042084, |
|
"grad_norm": 8.646223068237305, |
|
"learning_rate": 4.121863799283154e-05, |
|
"loss": 0.6712, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 5.210420841683367, |
|
"grad_norm": 5.398332118988037, |
|
"learning_rate": 4.099462365591398e-05, |
|
"loss": 0.6729, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 5.290581162324649, |
|
"grad_norm": 8.448481559753418, |
|
"learning_rate": 4.077060931899642e-05, |
|
"loss": 0.653, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 5.370741482965932, |
|
"grad_norm": 6.562283992767334, |
|
"learning_rate": 4.054659498207886e-05, |
|
"loss": 0.6835, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 5.4509018036072145, |
|
"grad_norm": 8.798486709594727, |
|
"learning_rate": 4.032258064516129e-05, |
|
"loss": 0.6849, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 5.531062124248497, |
|
"grad_norm": 8.284408569335938, |
|
"learning_rate": 4.009856630824373e-05, |
|
"loss": 0.6649, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 5.61122244488978, |
|
"grad_norm": 6.30844259262085, |
|
"learning_rate": 3.987455197132617e-05, |
|
"loss": 0.6893, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.6913827655310625, |
|
"grad_norm": 7.219119548797607, |
|
"learning_rate": 3.96505376344086e-05, |
|
"loss": 0.7007, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 5.771543086172345, |
|
"grad_norm": 8.133257865905762, |
|
"learning_rate": 3.9426523297491045e-05, |
|
"loss": 0.6595, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 5.851703406813627, |
|
"grad_norm": 7.415875434875488, |
|
"learning_rate": 3.9202508960573483e-05, |
|
"loss": 0.628, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 5.9318637274549095, |
|
"grad_norm": 7.713524341583252, |
|
"learning_rate": 3.8978494623655915e-05, |
|
"loss": 0.6518, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.995991983967936, |
|
"eval_accuracy": 0.818489289740699, |
|
"eval_loss": 0.537726879119873, |
|
"eval_runtime": 20.02, |
|
"eval_samples_per_second": 88.611, |
|
"eval_steps_per_second": 2.797, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 6.012024048096192, |
|
"grad_norm": 7.509452819824219, |
|
"learning_rate": 3.8754480286738354e-05, |
|
"loss": 0.6262, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 6.092184368737475, |
|
"grad_norm": 6.618509769439697, |
|
"learning_rate": 3.8530465949820786e-05, |
|
"loss": 0.6559, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 6.1723446893787575, |
|
"grad_norm": 5.969357490539551, |
|
"learning_rate": 3.8306451612903224e-05, |
|
"loss": 0.6324, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 6.25250501002004, |
|
"grad_norm": 7.1045918464660645, |
|
"learning_rate": 3.808243727598566e-05, |
|
"loss": 0.5872, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 6.332665330661323, |
|
"grad_norm": 6.669059753417969, |
|
"learning_rate": 3.78584229390681e-05, |
|
"loss": 0.581, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 6.412825651302605, |
|
"grad_norm": 7.305534839630127, |
|
"learning_rate": 3.763440860215054e-05, |
|
"loss": 0.6319, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 6.492985971943888, |
|
"grad_norm": 5.811188697814941, |
|
"learning_rate": 3.741039426523298e-05, |
|
"loss": 0.6461, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 6.573146292585171, |
|
"grad_norm": 5.710335731506348, |
|
"learning_rate": 3.718637992831541e-05, |
|
"loss": 0.6673, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 6.653306613226453, |
|
"grad_norm": 8.094255447387695, |
|
"learning_rate": 3.696236559139785e-05, |
|
"loss": 0.6506, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 6.733466933867735, |
|
"grad_norm": 6.600334644317627, |
|
"learning_rate": 3.673835125448029e-05, |
|
"loss": 0.6437, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 6.813627254509018, |
|
"grad_norm": 7.381925106048584, |
|
"learning_rate": 3.651433691756273e-05, |
|
"loss": 0.6173, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 6.8937875751503, |
|
"grad_norm": 7.205611705780029, |
|
"learning_rate": 3.6290322580645165e-05, |
|
"loss": 0.6427, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 6.973947895791583, |
|
"grad_norm": 6.106593608856201, |
|
"learning_rate": 3.60663082437276e-05, |
|
"loss": 0.6096, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 6.997995991983968, |
|
"eval_accuracy": 0.8218714768883878, |
|
"eval_loss": 0.5341029167175293, |
|
"eval_runtime": 13.1967, |
|
"eval_samples_per_second": 134.427, |
|
"eval_steps_per_second": 4.243, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 7.054108216432866, |
|
"grad_norm": 7.432844161987305, |
|
"learning_rate": 3.5842293906810036e-05, |
|
"loss": 0.6373, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 7.134268537074148, |
|
"grad_norm": 7.420022010803223, |
|
"learning_rate": 3.5618279569892474e-05, |
|
"loss": 0.6481, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 7.214428857715431, |
|
"grad_norm": 7.222751617431641, |
|
"learning_rate": 3.539426523297491e-05, |
|
"loss": 0.6267, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 7.294589178356714, |
|
"grad_norm": 7.050006866455078, |
|
"learning_rate": 3.517025089605735e-05, |
|
"loss": 0.6107, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 7.374749498997996, |
|
"grad_norm": 8.168829917907715, |
|
"learning_rate": 3.494623655913979e-05, |
|
"loss": 0.6008, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 7.454909819639279, |
|
"grad_norm": 5.229215145111084, |
|
"learning_rate": 3.472222222222222e-05, |
|
"loss": 0.6355, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 7.5350701402805615, |
|
"grad_norm": 6.677180290222168, |
|
"learning_rate": 3.449820788530466e-05, |
|
"loss": 0.6021, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 7.615230460921843, |
|
"grad_norm": 5.7284698486328125, |
|
"learning_rate": 3.427419354838709e-05, |
|
"loss": 0.602, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 7.695390781563126, |
|
"grad_norm": 7.613159656524658, |
|
"learning_rate": 3.405017921146954e-05, |
|
"loss": 0.578, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 7.775551102204409, |
|
"grad_norm": 7.990455150604248, |
|
"learning_rate": 3.382616487455198e-05, |
|
"loss": 0.6064, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 7.855711422845691, |
|
"grad_norm": 7.88253116607666, |
|
"learning_rate": 3.360215053763441e-05, |
|
"loss": 0.5796, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 7.935871743486974, |
|
"grad_norm": 6.4569091796875, |
|
"learning_rate": 3.337813620071685e-05, |
|
"loss": 0.6282, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.8399098083427283, |
|
"eval_loss": 0.4718434512615204, |
|
"eval_runtime": 18.7441, |
|
"eval_samples_per_second": 94.643, |
|
"eval_steps_per_second": 2.988, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 8.016032064128256, |
|
"grad_norm": 6.175160884857178, |
|
"learning_rate": 3.3154121863799286e-05, |
|
"loss": 0.5892, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 8.09619238476954, |
|
"grad_norm": 6.699339389801025, |
|
"learning_rate": 3.293010752688172e-05, |
|
"loss": 0.5914, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 8.176352705410821, |
|
"grad_norm": 7.189827919006348, |
|
"learning_rate": 3.270609318996416e-05, |
|
"loss": 0.5755, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 8.256513026052104, |
|
"grad_norm": 7.274308204650879, |
|
"learning_rate": 3.24820788530466e-05, |
|
"loss": 0.6348, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 8.336673346693386, |
|
"grad_norm": 5.570709228515625, |
|
"learning_rate": 3.2258064516129034e-05, |
|
"loss": 0.589, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 8.41683366733467, |
|
"grad_norm": 6.513092994689941, |
|
"learning_rate": 3.203405017921147e-05, |
|
"loss": 0.6085, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 8.496993987975952, |
|
"grad_norm": 6.354593753814697, |
|
"learning_rate": 3.1810035842293904e-05, |
|
"loss": 0.5925, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 8.577154308617235, |
|
"grad_norm": 6.1644392013549805, |
|
"learning_rate": 3.158602150537634e-05, |
|
"loss": 0.5287, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 8.657314629258517, |
|
"grad_norm": 7.383876800537109, |
|
"learning_rate": 3.136200716845878e-05, |
|
"loss": 0.6475, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 8.7374749498998, |
|
"grad_norm": 9.010411262512207, |
|
"learning_rate": 3.113799283154122e-05, |
|
"loss": 0.5934, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 8.817635270541082, |
|
"grad_norm": 5.401876449584961, |
|
"learning_rate": 3.091397849462366e-05, |
|
"loss": 0.5658, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 8.897795591182366, |
|
"grad_norm": 5.095533847808838, |
|
"learning_rate": 3.06899641577061e-05, |
|
"loss": 0.5791, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 8.977955911823647, |
|
"grad_norm": 6.6335248947143555, |
|
"learning_rate": 3.046594982078853e-05, |
|
"loss": 0.5394, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 8.993987975951903, |
|
"eval_accuracy": 0.8280721533258174, |
|
"eval_loss": 0.5112709999084473, |
|
"eval_runtime": 25.2314, |
|
"eval_samples_per_second": 70.309, |
|
"eval_steps_per_second": 2.219, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 9.05811623246493, |
|
"grad_norm": 6.276222229003906, |
|
"learning_rate": 3.024193548387097e-05, |
|
"loss": 0.586, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 9.138276553106213, |
|
"grad_norm": 5.882013320922852, |
|
"learning_rate": 3.0017921146953403e-05, |
|
"loss": 0.6203, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 9.218436873747494, |
|
"grad_norm": 7.025397777557373, |
|
"learning_rate": 2.979390681003584e-05, |
|
"loss": 0.5429, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 9.298597194388778, |
|
"grad_norm": 5.535187244415283, |
|
"learning_rate": 2.9569892473118284e-05, |
|
"loss": 0.5571, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 9.37875751503006, |
|
"grad_norm": 7.409646987915039, |
|
"learning_rate": 2.9345878136200715e-05, |
|
"loss": 0.5557, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 9.458917835671343, |
|
"grad_norm": 6.127359390258789, |
|
"learning_rate": 2.9121863799283154e-05, |
|
"loss": 0.5341, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 9.539078156312625, |
|
"grad_norm": 5.265384674072266, |
|
"learning_rate": 2.8897849462365596e-05, |
|
"loss": 0.5994, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 9.619238476953909, |
|
"grad_norm": 6.010611534118652, |
|
"learning_rate": 2.8673835125448028e-05, |
|
"loss": 0.5504, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 9.69939879759519, |
|
"grad_norm": 7.495913505554199, |
|
"learning_rate": 2.8449820788530467e-05, |
|
"loss": 0.5807, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 9.779559118236474, |
|
"grad_norm": 8.004411697387695, |
|
"learning_rate": 2.822580645161291e-05, |
|
"loss": 0.5911, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 9.859719438877756, |
|
"grad_norm": 7.179277420043945, |
|
"learning_rate": 2.800179211469534e-05, |
|
"loss": 0.5128, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 9.939879759519037, |
|
"grad_norm": 6.937490940093994, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.5718, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 9.995991983967937, |
|
"eval_accuracy": 0.8291995490417137, |
|
"eval_loss": 0.5018876791000366, |
|
"eval_runtime": 16.3385, |
|
"eval_samples_per_second": 108.578, |
|
"eval_steps_per_second": 3.427, |
|
"step": 1247 |
|
}, |
|
{ |
|
"epoch": 10.02004008016032, |
|
"grad_norm": 7.811807632446289, |
|
"learning_rate": 2.7553763440860214e-05, |
|
"loss": 0.5727, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 10.100200400801603, |
|
"grad_norm": 7.441296577453613, |
|
"learning_rate": 2.7329749103942653e-05, |
|
"loss": 0.5742, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 10.180360721442886, |
|
"grad_norm": 6.93259334564209, |
|
"learning_rate": 2.710573476702509e-05, |
|
"loss": 0.4814, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 10.260521042084168, |
|
"grad_norm": 7.736974716186523, |
|
"learning_rate": 2.6881720430107527e-05, |
|
"loss": 0.5448, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 10.340681362725451, |
|
"grad_norm": 7.408446788787842, |
|
"learning_rate": 2.6657706093189965e-05, |
|
"loss": 0.5892, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 10.420841683366733, |
|
"grad_norm": 6.906106472015381, |
|
"learning_rate": 2.6433691756272404e-05, |
|
"loss": 0.5175, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 10.501002004008017, |
|
"grad_norm": 5.426215648651123, |
|
"learning_rate": 2.620967741935484e-05, |
|
"loss": 0.5977, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 10.581162324649299, |
|
"grad_norm": 5.591187477111816, |
|
"learning_rate": 2.5985663082437278e-05, |
|
"loss": 0.5157, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 10.661322645290582, |
|
"grad_norm": 7.416080474853516, |
|
"learning_rate": 2.5761648745519713e-05, |
|
"loss": 0.5578, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 10.741482965931864, |
|
"grad_norm": 6.815114498138428, |
|
"learning_rate": 2.5537634408602152e-05, |
|
"loss": 0.5387, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 10.821643286573146, |
|
"grad_norm": 8.681703567504883, |
|
"learning_rate": 2.531362007168459e-05, |
|
"loss": 0.5299, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 10.901803607214429, |
|
"grad_norm": 5.358316421508789, |
|
"learning_rate": 2.5089605734767026e-05, |
|
"loss": 0.59, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 10.98196392785571, |
|
"grad_norm": 8.894550323486328, |
|
"learning_rate": 2.4865591397849464e-05, |
|
"loss": 0.5507, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 10.997995991983968, |
|
"eval_accuracy": 0.8461104847801578, |
|
"eval_loss": 0.4545128643512726, |
|
"eval_runtime": 17.189, |
|
"eval_samples_per_second": 103.206, |
|
"eval_steps_per_second": 3.258, |
|
"step": 1372 |
|
}, |
|
{ |
|
"epoch": 11.062124248496994, |
|
"grad_norm": 6.441153526306152, |
|
"learning_rate": 2.46415770609319e-05, |
|
"loss": 0.5485, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 11.142284569138276, |
|
"grad_norm": 5.337042808532715, |
|
"learning_rate": 2.4417562724014338e-05, |
|
"loss": 0.4986, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 11.22244488977956, |
|
"grad_norm": 5.681359767913818, |
|
"learning_rate": 2.4193548387096777e-05, |
|
"loss": 0.5213, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 11.302605210420841, |
|
"grad_norm": 6.7940778732299805, |
|
"learning_rate": 2.3969534050179212e-05, |
|
"loss": 0.5415, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 11.382765531062125, |
|
"grad_norm": 5.4109930992126465, |
|
"learning_rate": 2.374551971326165e-05, |
|
"loss": 0.5748, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 11.462925851703407, |
|
"grad_norm": 7.78901481628418, |
|
"learning_rate": 2.352150537634409e-05, |
|
"loss": 0.528, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 11.54308617234469, |
|
"grad_norm": 5.304915904998779, |
|
"learning_rate": 2.3297491039426525e-05, |
|
"loss": 0.5709, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 11.623246492985972, |
|
"grad_norm": 6.8759846687316895, |
|
"learning_rate": 2.307347670250896e-05, |
|
"loss": 0.5708, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 11.703406813627254, |
|
"grad_norm": 5.734496593475342, |
|
"learning_rate": 2.28494623655914e-05, |
|
"loss": 0.5178, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 11.783567134268537, |
|
"grad_norm": 7.169252395629883, |
|
"learning_rate": 2.2625448028673837e-05, |
|
"loss": 0.5595, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 11.863727454909819, |
|
"grad_norm": 6.391491413116455, |
|
"learning_rate": 2.2401433691756272e-05, |
|
"loss": 0.5883, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 11.943887775551103, |
|
"grad_norm": 5.931715965270996, |
|
"learning_rate": 2.217741935483871e-05, |
|
"loss": 0.4921, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.8416009019165727, |
|
"eval_loss": 0.46128037571907043, |
|
"eval_runtime": 27.7579, |
|
"eval_samples_per_second": 63.91, |
|
"eval_steps_per_second": 2.017, |
|
"step": 1497 |
|
}, |
|
{ |
|
"epoch": 12.024048096192384, |
|
"grad_norm": 5.848583221435547, |
|
"learning_rate": 2.1953405017921146e-05, |
|
"loss": 0.5388, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 12.104208416833668, |
|
"grad_norm": 5.273708343505859, |
|
"learning_rate": 2.1729390681003585e-05, |
|
"loss": 0.5259, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 12.18436873747495, |
|
"grad_norm": 6.022935390472412, |
|
"learning_rate": 2.1505376344086024e-05, |
|
"loss": 0.4928, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 12.264529058116233, |
|
"grad_norm": 4.965794563293457, |
|
"learning_rate": 2.128136200716846e-05, |
|
"loss": 0.537, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 12.344689378757515, |
|
"grad_norm": 6.983731746673584, |
|
"learning_rate": 2.1057347670250897e-05, |
|
"loss": 0.5258, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 12.424849699398798, |
|
"grad_norm": 6.290835380554199, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 0.5411, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 12.50501002004008, |
|
"grad_norm": 6.071152210235596, |
|
"learning_rate": 2.060931899641577e-05, |
|
"loss": 0.5421, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 12.585170340681362, |
|
"grad_norm": 7.99808931350708, |
|
"learning_rate": 2.038530465949821e-05, |
|
"loss": 0.5741, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 12.665330661322646, |
|
"grad_norm": 7.839056015014648, |
|
"learning_rate": 2.0161290322580645e-05, |
|
"loss": 0.5069, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 12.745490981963927, |
|
"grad_norm": 6.645950794219971, |
|
"learning_rate": 1.9937275985663084e-05, |
|
"loss": 0.5598, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 12.82565130260521, |
|
"grad_norm": 6.195275783538818, |
|
"learning_rate": 1.9713261648745522e-05, |
|
"loss": 0.5128, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 12.905811623246493, |
|
"grad_norm": 6.307319164276123, |
|
"learning_rate": 1.9489247311827958e-05, |
|
"loss": 0.5218, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 12.985971943887776, |
|
"grad_norm": 5.337151527404785, |
|
"learning_rate": 1.9265232974910393e-05, |
|
"loss": 0.5571, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 12.993987975951903, |
|
"eval_accuracy": 0.8416009019165727, |
|
"eval_loss": 0.45865094661712646, |
|
"eval_runtime": 13.0045, |
|
"eval_samples_per_second": 136.414, |
|
"eval_steps_per_second": 4.306, |
|
"step": 1621 |
|
}, |
|
{ |
|
"epoch": 13.066132264529058, |
|
"grad_norm": 5.886476516723633, |
|
"learning_rate": 1.904121863799283e-05, |
|
"loss": 0.5145, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 13.146292585170341, |
|
"grad_norm": 7.12263298034668, |
|
"learning_rate": 1.881720430107527e-05, |
|
"loss": 0.4775, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 13.226452905811623, |
|
"grad_norm": 6.896437168121338, |
|
"learning_rate": 1.8593189964157705e-05, |
|
"loss": 0.4922, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 13.306613226452907, |
|
"grad_norm": 7.87682580947876, |
|
"learning_rate": 1.8369175627240144e-05, |
|
"loss": 0.5151, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 13.386773547094188, |
|
"grad_norm": 6.32350492477417, |
|
"learning_rate": 1.8145161290322583e-05, |
|
"loss": 0.5308, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 13.46693386773547, |
|
"grad_norm": 6.5004353523254395, |
|
"learning_rate": 1.7921146953405018e-05, |
|
"loss": 0.4914, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 13.547094188376754, |
|
"grad_norm": 6.300237655639648, |
|
"learning_rate": 1.7697132616487457e-05, |
|
"loss": 0.5254, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 13.627254509018035, |
|
"grad_norm": 6.251715660095215, |
|
"learning_rate": 1.7473118279569895e-05, |
|
"loss": 0.5197, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 13.707414829659319, |
|
"grad_norm": 8.49095630645752, |
|
"learning_rate": 1.724910394265233e-05, |
|
"loss": 0.5269, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 13.7875751503006, |
|
"grad_norm": 6.206210613250732, |
|
"learning_rate": 1.702508960573477e-05, |
|
"loss": 0.5317, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 13.867735470941884, |
|
"grad_norm": 6.281041145324707, |
|
"learning_rate": 1.6801075268817204e-05, |
|
"loss": 0.5212, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 13.947895791583166, |
|
"grad_norm": 5.863707542419434, |
|
"learning_rate": 1.6577060931899643e-05, |
|
"loss": 0.512, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 13.995991983967937, |
|
"eval_accuracy": 0.8511837655016911, |
|
"eval_loss": 0.46732643246650696, |
|
"eval_runtime": 20.8784, |
|
"eval_samples_per_second": 84.968, |
|
"eval_steps_per_second": 2.682, |
|
"step": 1746 |
|
}, |
|
{ |
|
"epoch": 14.02805611222445, |
|
"grad_norm": 7.321824550628662, |
|
"learning_rate": 1.635304659498208e-05, |
|
"loss": 0.5023, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 14.108216432865731, |
|
"grad_norm": 7.074238300323486, |
|
"learning_rate": 1.6129032258064517e-05, |
|
"loss": 0.5645, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 14.188376753507015, |
|
"grad_norm": 4.364939212799072, |
|
"learning_rate": 1.5905017921146952e-05, |
|
"loss": 0.4858, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 14.268537074148297, |
|
"grad_norm": 6.330202102661133, |
|
"learning_rate": 1.568100358422939e-05, |
|
"loss": 0.5145, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 14.348697394789578, |
|
"grad_norm": 6.827199935913086, |
|
"learning_rate": 1.545698924731183e-05, |
|
"loss": 0.4995, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 14.428857715430862, |
|
"grad_norm": 5.992321014404297, |
|
"learning_rate": 1.5232974910394265e-05, |
|
"loss": 0.511, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 14.509018036072144, |
|
"grad_norm": 6.993434429168701, |
|
"learning_rate": 1.5008960573476701e-05, |
|
"loss": 0.535, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 14.589178356713427, |
|
"grad_norm": 6.39487886428833, |
|
"learning_rate": 1.4784946236559142e-05, |
|
"loss": 0.4754, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 14.669338677354709, |
|
"grad_norm": 7.741076946258545, |
|
"learning_rate": 1.4560931899641577e-05, |
|
"loss": 0.488, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 14.749498997995993, |
|
"grad_norm": 6.242033958435059, |
|
"learning_rate": 1.4336917562724014e-05, |
|
"loss": 0.4965, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 14.829659318637274, |
|
"grad_norm": 7.820639610290527, |
|
"learning_rate": 1.4112903225806454e-05, |
|
"loss": 0.5113, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 14.909819639278558, |
|
"grad_norm": 5.047755718231201, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 0.4945, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 14.98997995991984, |
|
"grad_norm": 6.295690059661865, |
|
"learning_rate": 1.3664874551971326e-05, |
|
"loss": 0.4855, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 14.997995991983968, |
|
"eval_accuracy": 0.8489289740698985, |
|
"eval_loss": 0.4640846252441406, |
|
"eval_runtime": 12.5307, |
|
"eval_samples_per_second": 141.573, |
|
"eval_steps_per_second": 4.469, |
|
"step": 1871 |
|
}, |
|
{ |
|
"epoch": 15.070140280561123, |
|
"grad_norm": 5.256791114807129, |
|
"learning_rate": 1.3440860215053763e-05, |
|
"loss": 0.4476, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 15.150300601202405, |
|
"grad_norm": 6.767005920410156, |
|
"learning_rate": 1.3216845878136202e-05, |
|
"loss": 0.5136, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 15.230460921843687, |
|
"grad_norm": 6.730881690979004, |
|
"learning_rate": 1.2992831541218639e-05, |
|
"loss": 0.4952, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 15.31062124248497, |
|
"grad_norm": 5.721596717834473, |
|
"learning_rate": 1.2768817204301076e-05, |
|
"loss": 0.4984, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 15.390781563126252, |
|
"grad_norm": 5.367898941040039, |
|
"learning_rate": 1.2544802867383513e-05, |
|
"loss": 0.5366, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 15.470941883767535, |
|
"grad_norm": 7.774703502655029, |
|
"learning_rate": 1.232078853046595e-05, |
|
"loss": 0.4651, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 15.551102204408817, |
|
"grad_norm": 5.738451957702637, |
|
"learning_rate": 1.2096774193548388e-05, |
|
"loss": 0.4803, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 15.6312625250501, |
|
"grad_norm": 8.123086929321289, |
|
"learning_rate": 1.1872759856630825e-05, |
|
"loss": 0.5122, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 15.711422845691382, |
|
"grad_norm": 7.48280668258667, |
|
"learning_rate": 1.1648745519713262e-05, |
|
"loss": 0.4913, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 15.791583166332666, |
|
"grad_norm": 6.538034439086914, |
|
"learning_rate": 1.14247311827957e-05, |
|
"loss": 0.4653, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 15.871743486973948, |
|
"grad_norm": 6.203965663909912, |
|
"learning_rate": 1.1200716845878136e-05, |
|
"loss": 0.5026, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 15.951903807615231, |
|
"grad_norm": 7.2120490074157715, |
|
"learning_rate": 1.0976702508960573e-05, |
|
"loss": 0.4895, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.8449830890642616, |
|
"eval_loss": 0.4555908143520355, |
|
"eval_runtime": 23.4028, |
|
"eval_samples_per_second": 75.803, |
|
"eval_steps_per_second": 2.393, |
|
"step": 1996 |
|
}, |
|
{ |
|
"epoch": 16.03206412825651, |
|
"grad_norm": 5.893616199493408, |
|
"learning_rate": 1.0752688172043012e-05, |
|
"loss": 0.4819, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 16.112224448897795, |
|
"grad_norm": 7.0060133934021, |
|
"learning_rate": 1.0528673835125449e-05, |
|
"loss": 0.4928, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 16.19238476953908, |
|
"grad_norm": 5.684309005737305, |
|
"learning_rate": 1.0304659498207886e-05, |
|
"loss": 0.441, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 16.272545090180362, |
|
"grad_norm": 7.170827865600586, |
|
"learning_rate": 1.0080645161290323e-05, |
|
"loss": 0.4686, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 16.352705410821642, |
|
"grad_norm": 6.788947105407715, |
|
"learning_rate": 9.856630824372761e-06, |
|
"loss": 0.509, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 16.432865731462925, |
|
"grad_norm": 7.052069187164307, |
|
"learning_rate": 9.632616487455196e-06, |
|
"loss": 0.4727, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 16.51302605210421, |
|
"grad_norm": 6.445401668548584, |
|
"learning_rate": 9.408602150537635e-06, |
|
"loss": 0.5347, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 16.593186372745492, |
|
"grad_norm": 9.481761932373047, |
|
"learning_rate": 9.184587813620072e-06, |
|
"loss": 0.5089, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 16.673346693386772, |
|
"grad_norm": 5.852792739868164, |
|
"learning_rate": 8.960573476702509e-06, |
|
"loss": 0.4969, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 16.753507014028056, |
|
"grad_norm": 6.045396327972412, |
|
"learning_rate": 8.736559139784948e-06, |
|
"loss": 0.522, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 16.83366733466934, |
|
"grad_norm": 6.488787651062012, |
|
"learning_rate": 8.512544802867385e-06, |
|
"loss": 0.4813, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 16.91382765531062, |
|
"grad_norm": 7.611959934234619, |
|
"learning_rate": 8.288530465949821e-06, |
|
"loss": 0.4662, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 16.993987975951903, |
|
"grad_norm": 7.92677640914917, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 0.4809, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 16.993987975951903, |
|
"eval_accuracy": 0.8523111612175873, |
|
"eval_loss": 0.4317234456539154, |
|
"eval_runtime": 20.2324, |
|
"eval_samples_per_second": 87.681, |
|
"eval_steps_per_second": 2.768, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 17.074148296593187, |
|
"grad_norm": 6.190919876098633, |
|
"learning_rate": 7.840501792114695e-06, |
|
"loss": 0.4918, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 17.15430861723447, |
|
"grad_norm": 6.092956066131592, |
|
"learning_rate": 7.616487455197132e-06, |
|
"loss": 0.4602, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 17.23446893787575, |
|
"grad_norm": 7.813562870025635, |
|
"learning_rate": 7.392473118279571e-06, |
|
"loss": 0.5014, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 17.314629258517034, |
|
"grad_norm": 7.895810127258301, |
|
"learning_rate": 7.168458781362007e-06, |
|
"loss": 0.4704, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 17.394789579158317, |
|
"grad_norm": 7.144327640533447, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 0.5298, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 17.4749498997996, |
|
"grad_norm": 6.57069730758667, |
|
"learning_rate": 6.720430107526882e-06, |
|
"loss": 0.4797, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 17.55511022044088, |
|
"grad_norm": 5.174849510192871, |
|
"learning_rate": 6.4964157706093195e-06, |
|
"loss": 0.4697, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 17.635270541082164, |
|
"grad_norm": 4.489311218261719, |
|
"learning_rate": 6.2724014336917564e-06, |
|
"loss": 0.4764, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 17.715430861723448, |
|
"grad_norm": 8.91657829284668, |
|
"learning_rate": 6.048387096774194e-06, |
|
"loss": 0.4229, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 17.79559118236473, |
|
"grad_norm": 8.482898712158203, |
|
"learning_rate": 5.824372759856631e-06, |
|
"loss": 0.4835, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 17.87575150300601, |
|
"grad_norm": 7.158608436584473, |
|
"learning_rate": 5.600358422939068e-06, |
|
"loss": 0.4764, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 17.955911823647295, |
|
"grad_norm": 7.100325107574463, |
|
"learning_rate": 5.376344086021506e-06, |
|
"loss": 0.4785, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 17.995991983967937, |
|
"eval_accuracy": 0.8534385569334837, |
|
"eval_loss": 0.4337688088417053, |
|
"eval_runtime": 14.1747, |
|
"eval_samples_per_second": 125.152, |
|
"eval_steps_per_second": 3.951, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 18.03607214428858, |
|
"grad_norm": 6.301604747772217, |
|
"learning_rate": 5.152329749103943e-06, |
|
"loss": 0.4679, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 18.11623246492986, |
|
"grad_norm": 8.363497734069824, |
|
"learning_rate": 4.928315412186381e-06, |
|
"loss": 0.4164, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 18.196392785571142, |
|
"grad_norm": 6.71609354019165, |
|
"learning_rate": 4.7043010752688175e-06, |
|
"loss": 0.4596, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 18.276553106212425, |
|
"grad_norm": 5.690964221954346, |
|
"learning_rate": 4.4802867383512545e-06, |
|
"loss": 0.4774, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 18.35671342685371, |
|
"grad_norm": 7.212980270385742, |
|
"learning_rate": 4.256272401433692e-06, |
|
"loss": 0.5049, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 18.43687374749499, |
|
"grad_norm": 7.2161149978637695, |
|
"learning_rate": 4.032258064516129e-06, |
|
"loss": 0.4504, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 18.517034068136272, |
|
"grad_norm": 5.582963466644287, |
|
"learning_rate": 3.808243727598566e-06, |
|
"loss": 0.4624, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 18.597194388777556, |
|
"grad_norm": 6.577459335327148, |
|
"learning_rate": 3.5842293906810035e-06, |
|
"loss": 0.4636, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 18.677354709418836, |
|
"grad_norm": 6.8889594078063965, |
|
"learning_rate": 3.360215053763441e-06, |
|
"loss": 0.4418, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 18.75751503006012, |
|
"grad_norm": 7.460567951202393, |
|
"learning_rate": 3.1362007168458782e-06, |
|
"loss": 0.49, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 18.837675350701403, |
|
"grad_norm": 6.316689491271973, |
|
"learning_rate": 2.9121863799283156e-06, |
|
"loss": 0.4392, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 18.917835671342687, |
|
"grad_norm": 7.855792999267578, |
|
"learning_rate": 2.688172043010753e-06, |
|
"loss": 0.4779, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 18.997995991983966, |
|
"grad_norm": 5.641603946685791, |
|
"learning_rate": 2.4641577060931903e-06, |
|
"loss": 0.444, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 18.997995991983966, |
|
"eval_accuracy": 0.8579481397970687, |
|
"eval_loss": 0.4356663227081299, |
|
"eval_runtime": 12.8983, |
|
"eval_samples_per_second": 137.538, |
|
"eval_steps_per_second": 4.342, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 19.07815631262525, |
|
"grad_norm": 7.950003623962402, |
|
"learning_rate": 2.2401433691756272e-06, |
|
"loss": 0.4587, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 19.158316633266534, |
|
"grad_norm": 6.9599995613098145, |
|
"learning_rate": 2.0161290322580646e-06, |
|
"loss": 0.4754, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 19.238476953907817, |
|
"grad_norm": 6.827354907989502, |
|
"learning_rate": 1.7921146953405017e-06, |
|
"loss": 0.4576, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 19.318637274549097, |
|
"grad_norm": 5.975595951080322, |
|
"learning_rate": 1.5681003584229391e-06, |
|
"loss": 0.427, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 19.39879759519038, |
|
"grad_norm": 7.829305648803711, |
|
"learning_rate": 1.3440860215053765e-06, |
|
"loss": 0.5141, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 19.478957915831664, |
|
"grad_norm": 5.588257312774658, |
|
"learning_rate": 1.1200716845878136e-06, |
|
"loss": 0.4609, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 19.559118236472948, |
|
"grad_norm": 8.075860023498535, |
|
"learning_rate": 8.960573476702509e-07, |
|
"loss": 0.4977, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 19.639278557114228, |
|
"grad_norm": 7.977848052978516, |
|
"learning_rate": 6.720430107526882e-07, |
|
"loss": 0.4329, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 19.71943887775551, |
|
"grad_norm": 7.050076961517334, |
|
"learning_rate": 4.4802867383512544e-07, |
|
"loss": 0.4613, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 19.799599198396795, |
|
"grad_norm": 6.357409954071045, |
|
"learning_rate": 2.2401433691756272e-07, |
|
"loss": 0.4457, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 19.879759519038075, |
|
"grad_norm": 4.937966346740723, |
|
"learning_rate": 0.0, |
|
"loss": 0.4255, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 19.879759519038075, |
|
"eval_accuracy": 0.8585118376550169, |
|
"eval_loss": 0.4356611371040344, |
|
"eval_runtime": 26.523, |
|
"eval_samples_per_second": 66.885, |
|
"eval_steps_per_second": 2.111, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 19.879759519038075, |
|
"step": 2480, |
|
"total_flos": 1.0326291224762253e+19, |
|
"train_loss": 0.6660777115052746, |
|
"train_runtime": 7387.7979, |
|
"train_samples_per_second": 43.212, |
|
"train_steps_per_second": 0.336 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2480, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 1.0326291224762253e+19, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|