{ "best_metric": 0.8585118376550169, "best_model_checkpoint": "swinv2-tiny-patch4-window8-256-finetuned-galaxy10-decals/checkpoint-2480", "epoch": 19.879759519038075, "eval_steps": 500, "global_step": 2480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08016032064128256, "grad_norm": 3.107008457183838, "learning_rate": 2.0161290322580646e-06, "loss": 2.3373, "step": 10 }, { "epoch": 0.16032064128256512, "grad_norm": 3.129793167114258, "learning_rate": 4.032258064516129e-06, "loss": 2.2991, "step": 20 }, { "epoch": 0.24048096192384769, "grad_norm": 3.2072296142578125, "learning_rate": 6.048387096774194e-06, "loss": 2.2401, "step": 30 }, { "epoch": 0.32064128256513025, "grad_norm": 3.172947406768799, "learning_rate": 8.064516129032258e-06, "loss": 2.1683, "step": 40 }, { "epoch": 0.40080160320641284, "grad_norm": 4.017691612243652, "learning_rate": 1.0080645161290323e-05, "loss": 2.0858, "step": 50 }, { "epoch": 0.48096192384769537, "grad_norm": 4.591518402099609, "learning_rate": 1.2096774193548388e-05, "loss": 1.9855, "step": 60 }, { "epoch": 0.561122244488978, "grad_norm": 6.409041404724121, "learning_rate": 1.4112903225806454e-05, "loss": 1.8467, "step": 70 }, { "epoch": 0.6412825651302605, "grad_norm": 7.812229633331299, "learning_rate": 1.6129032258064517e-05, "loss": 1.7127, "step": 80 }, { "epoch": 0.7214428857715431, "grad_norm": 9.216848373413086, "learning_rate": 1.8145161290322583e-05, "loss": 1.551, "step": 90 }, { "epoch": 0.8016032064128257, "grad_norm": 7.334836483001709, "learning_rate": 2.0161290322580645e-05, "loss": 1.4516, "step": 100 }, { "epoch": 0.8817635270541082, "grad_norm": 7.710146427154541, "learning_rate": 2.217741935483871e-05, "loss": 1.3589, "step": 110 }, { "epoch": 0.9619238476953907, "grad_norm": 11.563941955566406, "learning_rate": 2.4193548387096777e-05, "loss": 1.318, "step": 120 }, { "epoch": 0.9939879759519038, "eval_accuracy": 0.6358511837655016, "eval_loss": 1.0409355163574219, "eval_runtime": 18.6446, "eval_samples_per_second": 95.148, "eval_steps_per_second": 3.004, "step": 124 }, { "epoch": 1.0420841683366733, "grad_norm": 9.838645935058594, "learning_rate": 2.620967741935484e-05, "loss": 1.2543, "step": 130 }, { "epoch": 1.122244488977956, "grad_norm": 9.036652565002441, "learning_rate": 2.822580645161291e-05, "loss": 1.2318, "step": 140 }, { "epoch": 1.2024048096192386, "grad_norm": 7.428995132446289, "learning_rate": 3.024193548387097e-05, "loss": 1.2297, "step": 150 }, { "epoch": 1.282565130260521, "grad_norm": 7.845966815948486, "learning_rate": 3.2258064516129034e-05, "loss": 1.1232, "step": 160 }, { "epoch": 1.3627254509018036, "grad_norm": 7.774607181549072, "learning_rate": 3.427419354838709e-05, "loss": 1.0894, "step": 170 }, { "epoch": 1.4428857715430863, "grad_norm": 12.192733764648438, "learning_rate": 3.6290322580645165e-05, "loss": 1.0552, "step": 180 }, { "epoch": 1.5230460921843687, "grad_norm": 7.907654762268066, "learning_rate": 3.8306451612903224e-05, "loss": 1.0559, "step": 190 }, { "epoch": 1.6032064128256514, "grad_norm": 8.524667739868164, "learning_rate": 4.032258064516129e-05, "loss": 1.0315, "step": 200 }, { "epoch": 1.6833667334669338, "grad_norm": 9.733327865600586, "learning_rate": 4.2338709677419356e-05, "loss": 1.0234, "step": 210 }, { "epoch": 1.7635270541082164, "grad_norm": 8.974815368652344, "learning_rate": 4.435483870967742e-05, "loss": 0.9642, "step": 220 }, { "epoch": 1.843687374749499, "grad_norm": 10.060734748840332, "learning_rate": 4.637096774193548e-05, "loss": 0.9358, "step": 230 }, { "epoch": 1.9238476953907817, "grad_norm": 10.902128219604492, "learning_rate": 4.8387096774193554e-05, "loss": 0.9268, "step": 240 }, { "epoch": 1.9959919839679359, "eval_accuracy": 0.7497181510710259, "eval_loss": 0.7163556814193726, "eval_runtime": 15.1868, "eval_samples_per_second": 116.812, "eval_steps_per_second": 3.687, "step": 249 }, { "epoch": 2.004008016032064, "grad_norm": 8.283767700195312, "learning_rate": 4.995519713261649e-05, "loss": 0.915, "step": 250 }, { "epoch": 2.0841683366733466, "grad_norm": 9.61968994140625, "learning_rate": 4.973118279569893e-05, "loss": 0.8681, "step": 260 }, { "epoch": 2.164328657314629, "grad_norm": 8.20541763305664, "learning_rate": 4.950716845878137e-05, "loss": 0.9177, "step": 270 }, { "epoch": 2.244488977955912, "grad_norm": 6.9433369636535645, "learning_rate": 4.92831541218638e-05, "loss": 0.8946, "step": 280 }, { "epoch": 2.3246492985971945, "grad_norm": 10.144634246826172, "learning_rate": 4.905913978494624e-05, "loss": 0.8933, "step": 290 }, { "epoch": 2.404809619238477, "grad_norm": 12.519510269165039, "learning_rate": 4.8835125448028677e-05, "loss": 0.8515, "step": 300 }, { "epoch": 2.4849699398797593, "grad_norm": 7.839648246765137, "learning_rate": 4.8611111111111115e-05, "loss": 0.9103, "step": 310 }, { "epoch": 2.565130260521042, "grad_norm": 9.056415557861328, "learning_rate": 4.8387096774193554e-05, "loss": 0.8601, "step": 320 }, { "epoch": 2.6452905811623246, "grad_norm": 10.264932632446289, "learning_rate": 4.8163082437275986e-05, "loss": 0.8363, "step": 330 }, { "epoch": 2.7254509018036073, "grad_norm": 8.888627052307129, "learning_rate": 4.7939068100358424e-05, "loss": 0.8075, "step": 340 }, { "epoch": 2.80561122244489, "grad_norm": 10.737099647521973, "learning_rate": 4.771505376344086e-05, "loss": 0.9107, "step": 350 }, { "epoch": 2.8857715430861726, "grad_norm": 7.076110363006592, "learning_rate": 4.74910394265233e-05, "loss": 0.7776, "step": 360 }, { "epoch": 2.9659318637274548, "grad_norm": 7.52149772644043, "learning_rate": 4.726702508960574e-05, "loss": 0.8221, "step": 370 }, { "epoch": 2.997995991983968, "eval_accuracy": 0.7874859075535513, "eval_loss": 0.6210038065910339, "eval_runtime": 16.893, "eval_samples_per_second": 105.014, "eval_steps_per_second": 3.315, "step": 374 }, { "epoch": 3.0460921843687374, "grad_norm": 10.58385181427002, "learning_rate": 4.704301075268818e-05, "loss": 0.8115, "step": 380 }, { "epoch": 3.12625250501002, "grad_norm": 11.055846214294434, "learning_rate": 4.681899641577061e-05, "loss": 0.7765, "step": 390 }, { "epoch": 3.2064128256513027, "grad_norm": 8.763452529907227, "learning_rate": 4.659498207885305e-05, "loss": 0.7307, "step": 400 }, { "epoch": 3.2865731462925853, "grad_norm": 6.804797172546387, "learning_rate": 4.637096774193548e-05, "loss": 0.7876, "step": 410 }, { "epoch": 3.3667334669338675, "grad_norm": 6.112203121185303, "learning_rate": 4.614695340501792e-05, "loss": 0.7391, "step": 420 }, { "epoch": 3.44689378757515, "grad_norm": 8.772920608520508, "learning_rate": 4.5922939068100365e-05, "loss": 0.7453, "step": 430 }, { "epoch": 3.527054108216433, "grad_norm": 5.974344253540039, "learning_rate": 4.56989247311828e-05, "loss": 0.7345, "step": 440 }, { "epoch": 3.6072144288577155, "grad_norm": 8.748202323913574, "learning_rate": 4.5474910394265236e-05, "loss": 0.8431, "step": 450 }, { "epoch": 3.687374749498998, "grad_norm": 8.03186321258545, "learning_rate": 4.5250896057347674e-05, "loss": 0.7337, "step": 460 }, { "epoch": 3.7675350701402808, "grad_norm": 7.393523216247559, "learning_rate": 4.5026881720430106e-05, "loss": 0.7451, "step": 470 }, { "epoch": 3.847695390781563, "grad_norm": 6.694340705871582, "learning_rate": 4.4802867383512545e-05, "loss": 0.8017, "step": 480 }, { "epoch": 3.9278557114228456, "grad_norm": 6.493546962738037, "learning_rate": 4.4578853046594983e-05, "loss": 0.7276, "step": 490 }, { "epoch": 4.0, "eval_accuracy": 0.8162344983089064, "eval_loss": 0.5563604235649109, "eval_runtime": 20.2707, "eval_samples_per_second": 87.515, "eval_steps_per_second": 2.763, "step": 499 }, { "epoch": 4.008016032064128, "grad_norm": 6.828517913818359, "learning_rate": 4.435483870967742e-05, "loss": 0.7295, "step": 500 }, { "epoch": 4.0881763527054105, "grad_norm": 8.60299301147461, "learning_rate": 4.413082437275986e-05, "loss": 0.7452, "step": 510 }, { "epoch": 4.168336673346693, "grad_norm": 7.014772415161133, "learning_rate": 4.390681003584229e-05, "loss": 0.7108, "step": 520 }, { "epoch": 4.248496993987976, "grad_norm": 10.6813325881958, "learning_rate": 4.368279569892473e-05, "loss": 0.7479, "step": 530 }, { "epoch": 4.328657314629258, "grad_norm": 6.949085712432861, "learning_rate": 4.345878136200717e-05, "loss": 0.6518, "step": 540 }, { "epoch": 4.408817635270541, "grad_norm": 6.594024658203125, "learning_rate": 4.323476702508961e-05, "loss": 0.7169, "step": 550 }, { "epoch": 4.488977955911824, "grad_norm": 8.18333625793457, "learning_rate": 4.301075268817205e-05, "loss": 0.7185, "step": 560 }, { "epoch": 4.569138276553106, "grad_norm": 5.988694190979004, "learning_rate": 4.2786738351254486e-05, "loss": 0.6685, "step": 570 }, { "epoch": 4.649298597194389, "grad_norm": 7.308699607849121, "learning_rate": 4.256272401433692e-05, "loss": 0.6542, "step": 580 }, { "epoch": 4.729458917835672, "grad_norm": 7.458045959472656, "learning_rate": 4.2338709677419356e-05, "loss": 0.6993, "step": 590 }, { "epoch": 4.809619238476954, "grad_norm": 8.139283180236816, "learning_rate": 4.2114695340501795e-05, "loss": 0.7078, "step": 600 }, { "epoch": 4.889779559118237, "grad_norm": 6.669909954071045, "learning_rate": 4.1890681003584233e-05, "loss": 0.6627, "step": 610 }, { "epoch": 4.969939879759519, "grad_norm": 6.659294605255127, "learning_rate": 4.166666666666667e-05, "loss": 0.6425, "step": 620 }, { "epoch": 4.993987975951904, "eval_accuracy": 0.8162344983089064, "eval_loss": 0.5226049423217773, "eval_runtime": 13.1794, "eval_samples_per_second": 134.604, "eval_steps_per_second": 4.249, "step": 623 }, { "epoch": 5.050100200400801, "grad_norm": 6.521961212158203, "learning_rate": 4.1442652329749104e-05, "loss": 0.6918, "step": 630 }, { "epoch": 5.130260521042084, "grad_norm": 8.646223068237305, "learning_rate": 4.121863799283154e-05, "loss": 0.6712, "step": 640 }, { "epoch": 5.210420841683367, "grad_norm": 5.398332118988037, "learning_rate": 4.099462365591398e-05, "loss": 0.6729, "step": 650 }, { "epoch": 5.290581162324649, "grad_norm": 8.448481559753418, "learning_rate": 4.077060931899642e-05, "loss": 0.653, "step": 660 }, { "epoch": 5.370741482965932, "grad_norm": 6.562283992767334, "learning_rate": 4.054659498207886e-05, "loss": 0.6835, "step": 670 }, { "epoch": 5.4509018036072145, "grad_norm": 8.798486709594727, "learning_rate": 4.032258064516129e-05, "loss": 0.6849, "step": 680 }, { "epoch": 5.531062124248497, "grad_norm": 8.284408569335938, "learning_rate": 4.009856630824373e-05, "loss": 0.6649, "step": 690 }, { "epoch": 5.61122244488978, "grad_norm": 6.30844259262085, "learning_rate": 3.987455197132617e-05, "loss": 0.6893, "step": 700 }, { "epoch": 5.6913827655310625, "grad_norm": 7.219119548797607, "learning_rate": 3.96505376344086e-05, "loss": 0.7007, "step": 710 }, { "epoch": 5.771543086172345, "grad_norm": 8.133257865905762, "learning_rate": 3.9426523297491045e-05, "loss": 0.6595, "step": 720 }, { "epoch": 5.851703406813627, "grad_norm": 7.415875434875488, "learning_rate": 3.9202508960573483e-05, "loss": 0.628, "step": 730 }, { "epoch": 5.9318637274549095, "grad_norm": 7.713524341583252, "learning_rate": 3.8978494623655915e-05, "loss": 0.6518, "step": 740 }, { "epoch": 5.995991983967936, "eval_accuracy": 0.818489289740699, "eval_loss": 0.537726879119873, "eval_runtime": 20.02, "eval_samples_per_second": 88.611, "eval_steps_per_second": 2.797, "step": 748 }, { "epoch": 6.012024048096192, "grad_norm": 7.509452819824219, "learning_rate": 3.8754480286738354e-05, "loss": 0.6262, "step": 750 }, { "epoch": 6.092184368737475, "grad_norm": 6.618509769439697, "learning_rate": 3.8530465949820786e-05, "loss": 0.6559, "step": 760 }, { "epoch": 6.1723446893787575, "grad_norm": 5.969357490539551, "learning_rate": 3.8306451612903224e-05, "loss": 0.6324, "step": 770 }, { "epoch": 6.25250501002004, "grad_norm": 7.1045918464660645, "learning_rate": 3.808243727598566e-05, "loss": 0.5872, "step": 780 }, { "epoch": 6.332665330661323, "grad_norm": 6.669059753417969, "learning_rate": 3.78584229390681e-05, "loss": 0.581, "step": 790 }, { "epoch": 6.412825651302605, "grad_norm": 7.305534839630127, "learning_rate": 3.763440860215054e-05, "loss": 0.6319, "step": 800 }, { "epoch": 6.492985971943888, "grad_norm": 5.811188697814941, "learning_rate": 3.741039426523298e-05, "loss": 0.6461, "step": 810 }, { "epoch": 6.573146292585171, "grad_norm": 5.710335731506348, "learning_rate": 3.718637992831541e-05, "loss": 0.6673, "step": 820 }, { "epoch": 6.653306613226453, "grad_norm": 8.094255447387695, "learning_rate": 3.696236559139785e-05, "loss": 0.6506, "step": 830 }, { "epoch": 6.733466933867735, "grad_norm": 6.600334644317627, "learning_rate": 3.673835125448029e-05, "loss": 0.6437, "step": 840 }, { "epoch": 6.813627254509018, "grad_norm": 7.381925106048584, "learning_rate": 3.651433691756273e-05, "loss": 0.6173, "step": 850 }, { "epoch": 6.8937875751503, "grad_norm": 7.205611705780029, "learning_rate": 3.6290322580645165e-05, "loss": 0.6427, "step": 860 }, { "epoch": 6.973947895791583, "grad_norm": 6.106593608856201, "learning_rate": 3.60663082437276e-05, "loss": 0.6096, "step": 870 }, { "epoch": 6.997995991983968, "eval_accuracy": 0.8218714768883878, "eval_loss": 0.5341029167175293, "eval_runtime": 13.1967, "eval_samples_per_second": 134.427, "eval_steps_per_second": 4.243, "step": 873 }, { "epoch": 7.054108216432866, "grad_norm": 7.432844161987305, "learning_rate": 3.5842293906810036e-05, "loss": 0.6373, "step": 880 }, { "epoch": 7.134268537074148, "grad_norm": 7.420022010803223, "learning_rate": 3.5618279569892474e-05, "loss": 0.6481, "step": 890 }, { "epoch": 7.214428857715431, "grad_norm": 7.222751617431641, "learning_rate": 3.539426523297491e-05, "loss": 0.6267, "step": 900 }, { "epoch": 7.294589178356714, "grad_norm": 7.050006866455078, "learning_rate": 3.517025089605735e-05, "loss": 0.6107, "step": 910 }, { "epoch": 7.374749498997996, "grad_norm": 8.168829917907715, "learning_rate": 3.494623655913979e-05, "loss": 0.6008, "step": 920 }, { "epoch": 7.454909819639279, "grad_norm": 5.229215145111084, "learning_rate": 3.472222222222222e-05, "loss": 0.6355, "step": 930 }, { "epoch": 7.5350701402805615, "grad_norm": 6.677180290222168, "learning_rate": 3.449820788530466e-05, "loss": 0.6021, "step": 940 }, { "epoch": 7.615230460921843, "grad_norm": 5.7284698486328125, "learning_rate": 3.427419354838709e-05, "loss": 0.602, "step": 950 }, { "epoch": 7.695390781563126, "grad_norm": 7.613159656524658, "learning_rate": 3.405017921146954e-05, "loss": 0.578, "step": 960 }, { "epoch": 7.775551102204409, "grad_norm": 7.990455150604248, "learning_rate": 3.382616487455198e-05, "loss": 0.6064, "step": 970 }, { "epoch": 7.855711422845691, "grad_norm": 7.88253116607666, "learning_rate": 3.360215053763441e-05, "loss": 0.5796, "step": 980 }, { "epoch": 7.935871743486974, "grad_norm": 6.4569091796875, "learning_rate": 3.337813620071685e-05, "loss": 0.6282, "step": 990 }, { "epoch": 8.0, "eval_accuracy": 0.8399098083427283, "eval_loss": 0.4718434512615204, "eval_runtime": 18.7441, "eval_samples_per_second": 94.643, "eval_steps_per_second": 2.988, "step": 998 }, { "epoch": 8.016032064128256, "grad_norm": 6.175160884857178, "learning_rate": 3.3154121863799286e-05, "loss": 0.5892, "step": 1000 }, { "epoch": 8.09619238476954, "grad_norm": 6.699339389801025, "learning_rate": 3.293010752688172e-05, "loss": 0.5914, "step": 1010 }, { "epoch": 8.176352705410821, "grad_norm": 7.189827919006348, "learning_rate": 3.270609318996416e-05, "loss": 0.5755, "step": 1020 }, { "epoch": 8.256513026052104, "grad_norm": 7.274308204650879, "learning_rate": 3.24820788530466e-05, "loss": 0.6348, "step": 1030 }, { "epoch": 8.336673346693386, "grad_norm": 5.570709228515625, "learning_rate": 3.2258064516129034e-05, "loss": 0.589, "step": 1040 }, { "epoch": 8.41683366733467, "grad_norm": 6.513092994689941, "learning_rate": 3.203405017921147e-05, "loss": 0.6085, "step": 1050 }, { "epoch": 8.496993987975952, "grad_norm": 6.354593753814697, "learning_rate": 3.1810035842293904e-05, "loss": 0.5925, "step": 1060 }, { "epoch": 8.577154308617235, "grad_norm": 6.1644392013549805, "learning_rate": 3.158602150537634e-05, "loss": 0.5287, "step": 1070 }, { "epoch": 8.657314629258517, "grad_norm": 7.383876800537109, "learning_rate": 3.136200716845878e-05, "loss": 0.6475, "step": 1080 }, { "epoch": 8.7374749498998, "grad_norm": 9.010411262512207, "learning_rate": 3.113799283154122e-05, "loss": 0.5934, "step": 1090 }, { "epoch": 8.817635270541082, "grad_norm": 5.401876449584961, "learning_rate": 3.091397849462366e-05, "loss": 0.5658, "step": 1100 }, { "epoch": 8.897795591182366, "grad_norm": 5.095533847808838, "learning_rate": 3.06899641577061e-05, "loss": 0.5791, "step": 1110 }, { "epoch": 8.977955911823647, "grad_norm": 6.6335248947143555, "learning_rate": 3.046594982078853e-05, "loss": 0.5394, "step": 1120 }, { "epoch": 8.993987975951903, "eval_accuracy": 0.8280721533258174, "eval_loss": 0.5112709999084473, "eval_runtime": 25.2314, "eval_samples_per_second": 70.309, "eval_steps_per_second": 2.219, "step": 1122 }, { "epoch": 9.05811623246493, "grad_norm": 6.276222229003906, "learning_rate": 3.024193548387097e-05, "loss": 0.586, "step": 1130 }, { "epoch": 9.138276553106213, "grad_norm": 5.882013320922852, "learning_rate": 3.0017921146953403e-05, "loss": 0.6203, "step": 1140 }, { "epoch": 9.218436873747494, "grad_norm": 7.025397777557373, "learning_rate": 2.979390681003584e-05, "loss": 0.5429, "step": 1150 }, { "epoch": 9.298597194388778, "grad_norm": 5.535187244415283, "learning_rate": 2.9569892473118284e-05, "loss": 0.5571, "step": 1160 }, { "epoch": 9.37875751503006, "grad_norm": 7.409646987915039, "learning_rate": 2.9345878136200715e-05, "loss": 0.5557, "step": 1170 }, { "epoch": 9.458917835671343, "grad_norm": 6.127359390258789, "learning_rate": 2.9121863799283154e-05, "loss": 0.5341, "step": 1180 }, { "epoch": 9.539078156312625, "grad_norm": 5.265384674072266, "learning_rate": 2.8897849462365596e-05, "loss": 0.5994, "step": 1190 }, { "epoch": 9.619238476953909, "grad_norm": 6.010611534118652, "learning_rate": 2.8673835125448028e-05, "loss": 0.5504, "step": 1200 }, { "epoch": 9.69939879759519, "grad_norm": 7.495913505554199, "learning_rate": 2.8449820788530467e-05, "loss": 0.5807, "step": 1210 }, { "epoch": 9.779559118236474, "grad_norm": 8.004411697387695, "learning_rate": 2.822580645161291e-05, "loss": 0.5911, "step": 1220 }, { "epoch": 9.859719438877756, "grad_norm": 7.179277420043945, "learning_rate": 2.800179211469534e-05, "loss": 0.5128, "step": 1230 }, { "epoch": 9.939879759519037, "grad_norm": 6.937490940093994, "learning_rate": 2.777777777777778e-05, "loss": 0.5718, "step": 1240 }, { "epoch": 9.995991983967937, "eval_accuracy": 0.8291995490417137, "eval_loss": 0.5018876791000366, "eval_runtime": 16.3385, "eval_samples_per_second": 108.578, "eval_steps_per_second": 3.427, "step": 1247 }, { "epoch": 10.02004008016032, "grad_norm": 7.811807632446289, "learning_rate": 2.7553763440860214e-05, "loss": 0.5727, "step": 1250 }, { "epoch": 10.100200400801603, "grad_norm": 7.441296577453613, "learning_rate": 2.7329749103942653e-05, "loss": 0.5742, "step": 1260 }, { "epoch": 10.180360721442886, "grad_norm": 6.93259334564209, "learning_rate": 2.710573476702509e-05, "loss": 0.4814, "step": 1270 }, { "epoch": 10.260521042084168, "grad_norm": 7.736974716186523, "learning_rate": 2.6881720430107527e-05, "loss": 0.5448, "step": 1280 }, { "epoch": 10.340681362725451, "grad_norm": 7.408446788787842, "learning_rate": 2.6657706093189965e-05, "loss": 0.5892, "step": 1290 }, { "epoch": 10.420841683366733, "grad_norm": 6.906106472015381, "learning_rate": 2.6433691756272404e-05, "loss": 0.5175, "step": 1300 }, { "epoch": 10.501002004008017, "grad_norm": 5.426215648651123, "learning_rate": 2.620967741935484e-05, "loss": 0.5977, "step": 1310 }, { "epoch": 10.581162324649299, "grad_norm": 5.591187477111816, "learning_rate": 2.5985663082437278e-05, "loss": 0.5157, "step": 1320 }, { "epoch": 10.661322645290582, "grad_norm": 7.416080474853516, "learning_rate": 2.5761648745519713e-05, "loss": 0.5578, "step": 1330 }, { "epoch": 10.741482965931864, "grad_norm": 6.815114498138428, "learning_rate": 2.5537634408602152e-05, "loss": 0.5387, "step": 1340 }, { "epoch": 10.821643286573146, "grad_norm": 8.681703567504883, "learning_rate": 2.531362007168459e-05, "loss": 0.5299, "step": 1350 }, { "epoch": 10.901803607214429, "grad_norm": 5.358316421508789, "learning_rate": 2.5089605734767026e-05, "loss": 0.59, "step": 1360 }, { "epoch": 10.98196392785571, "grad_norm": 8.894550323486328, "learning_rate": 2.4865591397849464e-05, "loss": 0.5507, "step": 1370 }, { "epoch": 10.997995991983968, "eval_accuracy": 0.8461104847801578, "eval_loss": 0.4545128643512726, "eval_runtime": 17.189, "eval_samples_per_second": 103.206, "eval_steps_per_second": 3.258, "step": 1372 }, { "epoch": 11.062124248496994, "grad_norm": 6.441153526306152, "learning_rate": 2.46415770609319e-05, "loss": 0.5485, "step": 1380 }, { "epoch": 11.142284569138276, "grad_norm": 5.337042808532715, "learning_rate": 2.4417562724014338e-05, "loss": 0.4986, "step": 1390 }, { "epoch": 11.22244488977956, "grad_norm": 5.681359767913818, "learning_rate": 2.4193548387096777e-05, "loss": 0.5213, "step": 1400 }, { "epoch": 11.302605210420841, "grad_norm": 6.7940778732299805, "learning_rate": 2.3969534050179212e-05, "loss": 0.5415, "step": 1410 }, { "epoch": 11.382765531062125, "grad_norm": 5.4109930992126465, "learning_rate": 2.374551971326165e-05, "loss": 0.5748, "step": 1420 }, { "epoch": 11.462925851703407, "grad_norm": 7.78901481628418, "learning_rate": 2.352150537634409e-05, "loss": 0.528, "step": 1430 }, { "epoch": 11.54308617234469, "grad_norm": 5.304915904998779, "learning_rate": 2.3297491039426525e-05, "loss": 0.5709, "step": 1440 }, { "epoch": 11.623246492985972, "grad_norm": 6.8759846687316895, "learning_rate": 2.307347670250896e-05, "loss": 0.5708, "step": 1450 }, { "epoch": 11.703406813627254, "grad_norm": 5.734496593475342, "learning_rate": 2.28494623655914e-05, "loss": 0.5178, "step": 1460 }, { "epoch": 11.783567134268537, "grad_norm": 7.169252395629883, "learning_rate": 2.2625448028673837e-05, "loss": 0.5595, "step": 1470 }, { "epoch": 11.863727454909819, "grad_norm": 6.391491413116455, "learning_rate": 2.2401433691756272e-05, "loss": 0.5883, "step": 1480 }, { "epoch": 11.943887775551103, "grad_norm": 5.931715965270996, "learning_rate": 2.217741935483871e-05, "loss": 0.4921, "step": 1490 }, { "epoch": 12.0, "eval_accuracy": 0.8416009019165727, "eval_loss": 0.46128037571907043, "eval_runtime": 27.7579, "eval_samples_per_second": 63.91, "eval_steps_per_second": 2.017, "step": 1497 }, { "epoch": 12.024048096192384, "grad_norm": 5.848583221435547, "learning_rate": 2.1953405017921146e-05, "loss": 0.5388, "step": 1500 }, { "epoch": 12.104208416833668, "grad_norm": 5.273708343505859, "learning_rate": 2.1729390681003585e-05, "loss": 0.5259, "step": 1510 }, { "epoch": 12.18436873747495, "grad_norm": 6.022935390472412, "learning_rate": 2.1505376344086024e-05, "loss": 0.4928, "step": 1520 }, { "epoch": 12.264529058116233, "grad_norm": 4.965794563293457, "learning_rate": 2.128136200716846e-05, "loss": 0.537, "step": 1530 }, { "epoch": 12.344689378757515, "grad_norm": 6.983731746673584, "learning_rate": 2.1057347670250897e-05, "loss": 0.5258, "step": 1540 }, { "epoch": 12.424849699398798, "grad_norm": 6.290835380554199, "learning_rate": 2.0833333333333336e-05, "loss": 0.5411, "step": 1550 }, { "epoch": 12.50501002004008, "grad_norm": 6.071152210235596, "learning_rate": 2.060931899641577e-05, "loss": 0.5421, "step": 1560 }, { "epoch": 12.585170340681362, "grad_norm": 7.99808931350708, "learning_rate": 2.038530465949821e-05, "loss": 0.5741, "step": 1570 }, { "epoch": 12.665330661322646, "grad_norm": 7.839056015014648, "learning_rate": 2.0161290322580645e-05, "loss": 0.5069, "step": 1580 }, { "epoch": 12.745490981963927, "grad_norm": 6.645950794219971, "learning_rate": 1.9937275985663084e-05, "loss": 0.5598, "step": 1590 }, { "epoch": 12.82565130260521, "grad_norm": 6.195275783538818, "learning_rate": 1.9713261648745522e-05, "loss": 0.5128, "step": 1600 }, { "epoch": 12.905811623246493, "grad_norm": 6.307319164276123, "learning_rate": 1.9489247311827958e-05, "loss": 0.5218, "step": 1610 }, { "epoch": 12.985971943887776, "grad_norm": 5.337151527404785, "learning_rate": 1.9265232974910393e-05, "loss": 0.5571, "step": 1620 }, { "epoch": 12.993987975951903, "eval_accuracy": 0.8416009019165727, "eval_loss": 0.45865094661712646, "eval_runtime": 13.0045, "eval_samples_per_second": 136.414, "eval_steps_per_second": 4.306, "step": 1621 }, { "epoch": 13.066132264529058, "grad_norm": 5.886476516723633, "learning_rate": 1.904121863799283e-05, "loss": 0.5145, "step": 1630 }, { "epoch": 13.146292585170341, "grad_norm": 7.12263298034668, "learning_rate": 1.881720430107527e-05, "loss": 0.4775, "step": 1640 }, { "epoch": 13.226452905811623, "grad_norm": 6.896437168121338, "learning_rate": 1.8593189964157705e-05, "loss": 0.4922, "step": 1650 }, { "epoch": 13.306613226452907, "grad_norm": 7.87682580947876, "learning_rate": 1.8369175627240144e-05, "loss": 0.5151, "step": 1660 }, { "epoch": 13.386773547094188, "grad_norm": 6.32350492477417, "learning_rate": 1.8145161290322583e-05, "loss": 0.5308, "step": 1670 }, { "epoch": 13.46693386773547, "grad_norm": 6.5004353523254395, "learning_rate": 1.7921146953405018e-05, "loss": 0.4914, "step": 1680 }, { "epoch": 13.547094188376754, "grad_norm": 6.300237655639648, "learning_rate": 1.7697132616487457e-05, "loss": 0.5254, "step": 1690 }, { "epoch": 13.627254509018035, "grad_norm": 6.251715660095215, "learning_rate": 1.7473118279569895e-05, "loss": 0.5197, "step": 1700 }, { "epoch": 13.707414829659319, "grad_norm": 8.49095630645752, "learning_rate": 1.724910394265233e-05, "loss": 0.5269, "step": 1710 }, { "epoch": 13.7875751503006, "grad_norm": 6.206210613250732, "learning_rate": 1.702508960573477e-05, "loss": 0.5317, "step": 1720 }, { "epoch": 13.867735470941884, "grad_norm": 6.281041145324707, "learning_rate": 1.6801075268817204e-05, "loss": 0.5212, "step": 1730 }, { "epoch": 13.947895791583166, "grad_norm": 5.863707542419434, "learning_rate": 1.6577060931899643e-05, "loss": 0.512, "step": 1740 }, { "epoch": 13.995991983967937, "eval_accuracy": 0.8511837655016911, "eval_loss": 0.46732643246650696, "eval_runtime": 20.8784, "eval_samples_per_second": 84.968, "eval_steps_per_second": 2.682, "step": 1746 }, { "epoch": 14.02805611222445, "grad_norm": 7.321824550628662, "learning_rate": 1.635304659498208e-05, "loss": 0.5023, "step": 1750 }, { "epoch": 14.108216432865731, "grad_norm": 7.074238300323486, "learning_rate": 1.6129032258064517e-05, "loss": 0.5645, "step": 1760 }, { "epoch": 14.188376753507015, "grad_norm": 4.364939212799072, "learning_rate": 1.5905017921146952e-05, "loss": 0.4858, "step": 1770 }, { "epoch": 14.268537074148297, "grad_norm": 6.330202102661133, "learning_rate": 1.568100358422939e-05, "loss": 0.5145, "step": 1780 }, { "epoch": 14.348697394789578, "grad_norm": 6.827199935913086, "learning_rate": 1.545698924731183e-05, "loss": 0.4995, "step": 1790 }, { "epoch": 14.428857715430862, "grad_norm": 5.992321014404297, "learning_rate": 1.5232974910394265e-05, "loss": 0.511, "step": 1800 }, { "epoch": 14.509018036072144, "grad_norm": 6.993434429168701, "learning_rate": 1.5008960573476701e-05, "loss": 0.535, "step": 1810 }, { "epoch": 14.589178356713427, "grad_norm": 6.39487886428833, "learning_rate": 1.4784946236559142e-05, "loss": 0.4754, "step": 1820 }, { "epoch": 14.669338677354709, "grad_norm": 7.741076946258545, "learning_rate": 1.4560931899641577e-05, "loss": 0.488, "step": 1830 }, { "epoch": 14.749498997995993, "grad_norm": 6.242033958435059, "learning_rate": 1.4336917562724014e-05, "loss": 0.4965, "step": 1840 }, { "epoch": 14.829659318637274, "grad_norm": 7.820639610290527, "learning_rate": 1.4112903225806454e-05, "loss": 0.5113, "step": 1850 }, { "epoch": 14.909819639278558, "grad_norm": 5.047755718231201, "learning_rate": 1.388888888888889e-05, "loss": 0.4945, "step": 1860 }, { "epoch": 14.98997995991984, "grad_norm": 6.295690059661865, "learning_rate": 1.3664874551971326e-05, "loss": 0.4855, "step": 1870 }, { "epoch": 14.997995991983968, "eval_accuracy": 0.8489289740698985, "eval_loss": 0.4640846252441406, "eval_runtime": 12.5307, "eval_samples_per_second": 141.573, "eval_steps_per_second": 4.469, "step": 1871 }, { "epoch": 15.070140280561123, "grad_norm": 5.256791114807129, "learning_rate": 1.3440860215053763e-05, "loss": 0.4476, "step": 1880 }, { "epoch": 15.150300601202405, "grad_norm": 6.767005920410156, "learning_rate": 1.3216845878136202e-05, "loss": 0.5136, "step": 1890 }, { "epoch": 15.230460921843687, "grad_norm": 6.730881690979004, "learning_rate": 1.2992831541218639e-05, "loss": 0.4952, "step": 1900 }, { "epoch": 15.31062124248497, "grad_norm": 5.721596717834473, "learning_rate": 1.2768817204301076e-05, "loss": 0.4984, "step": 1910 }, { "epoch": 15.390781563126252, "grad_norm": 5.367898941040039, "learning_rate": 1.2544802867383513e-05, "loss": 0.5366, "step": 1920 }, { "epoch": 15.470941883767535, "grad_norm": 7.774703502655029, "learning_rate": 1.232078853046595e-05, "loss": 0.4651, "step": 1930 }, { "epoch": 15.551102204408817, "grad_norm": 5.738451957702637, "learning_rate": 1.2096774193548388e-05, "loss": 0.4803, "step": 1940 }, { "epoch": 15.6312625250501, "grad_norm": 8.123086929321289, "learning_rate": 1.1872759856630825e-05, "loss": 0.5122, "step": 1950 }, { "epoch": 15.711422845691382, "grad_norm": 7.48280668258667, "learning_rate": 1.1648745519713262e-05, "loss": 0.4913, "step": 1960 }, { "epoch": 15.791583166332666, "grad_norm": 6.538034439086914, "learning_rate": 1.14247311827957e-05, "loss": 0.4653, "step": 1970 }, { "epoch": 15.871743486973948, "grad_norm": 6.203965663909912, "learning_rate": 1.1200716845878136e-05, "loss": 0.5026, "step": 1980 }, { "epoch": 15.951903807615231, "grad_norm": 7.2120490074157715, "learning_rate": 1.0976702508960573e-05, "loss": 0.4895, "step": 1990 }, { "epoch": 16.0, "eval_accuracy": 0.8449830890642616, "eval_loss": 0.4555908143520355, "eval_runtime": 23.4028, "eval_samples_per_second": 75.803, "eval_steps_per_second": 2.393, "step": 1996 }, { "epoch": 16.03206412825651, "grad_norm": 5.893616199493408, "learning_rate": 1.0752688172043012e-05, "loss": 0.4819, "step": 2000 }, { "epoch": 16.112224448897795, "grad_norm": 7.0060133934021, "learning_rate": 1.0528673835125449e-05, "loss": 0.4928, "step": 2010 }, { "epoch": 16.19238476953908, "grad_norm": 5.684309005737305, "learning_rate": 1.0304659498207886e-05, "loss": 0.441, "step": 2020 }, { "epoch": 16.272545090180362, "grad_norm": 7.170827865600586, "learning_rate": 1.0080645161290323e-05, "loss": 0.4686, "step": 2030 }, { "epoch": 16.352705410821642, "grad_norm": 6.788947105407715, "learning_rate": 9.856630824372761e-06, "loss": 0.509, "step": 2040 }, { "epoch": 16.432865731462925, "grad_norm": 7.052069187164307, "learning_rate": 9.632616487455196e-06, "loss": 0.4727, "step": 2050 }, { "epoch": 16.51302605210421, "grad_norm": 6.445401668548584, "learning_rate": 9.408602150537635e-06, "loss": 0.5347, "step": 2060 }, { "epoch": 16.593186372745492, "grad_norm": 9.481761932373047, "learning_rate": 9.184587813620072e-06, "loss": 0.5089, "step": 2070 }, { "epoch": 16.673346693386772, "grad_norm": 5.852792739868164, "learning_rate": 8.960573476702509e-06, "loss": 0.4969, "step": 2080 }, { "epoch": 16.753507014028056, "grad_norm": 6.045396327972412, "learning_rate": 8.736559139784948e-06, "loss": 0.522, "step": 2090 }, { "epoch": 16.83366733466934, "grad_norm": 6.488787651062012, "learning_rate": 8.512544802867385e-06, "loss": 0.4813, "step": 2100 }, { "epoch": 16.91382765531062, "grad_norm": 7.611959934234619, "learning_rate": 8.288530465949821e-06, "loss": 0.4662, "step": 2110 }, { "epoch": 16.993987975951903, "grad_norm": 7.92677640914917, "learning_rate": 8.064516129032258e-06, "loss": 0.4809, "step": 2120 }, { "epoch": 16.993987975951903, "eval_accuracy": 0.8523111612175873, "eval_loss": 0.4317234456539154, "eval_runtime": 20.2324, "eval_samples_per_second": 87.681, "eval_steps_per_second": 2.768, "step": 2120 }, { "epoch": 17.074148296593187, "grad_norm": 6.190919876098633, "learning_rate": 7.840501792114695e-06, "loss": 0.4918, "step": 2130 }, { "epoch": 17.15430861723447, "grad_norm": 6.092956066131592, "learning_rate": 7.616487455197132e-06, "loss": 0.4602, "step": 2140 }, { "epoch": 17.23446893787575, "grad_norm": 7.813562870025635, "learning_rate": 7.392473118279571e-06, "loss": 0.5014, "step": 2150 }, { "epoch": 17.314629258517034, "grad_norm": 7.895810127258301, "learning_rate": 7.168458781362007e-06, "loss": 0.4704, "step": 2160 }, { "epoch": 17.394789579158317, "grad_norm": 7.144327640533447, "learning_rate": 6.944444444444445e-06, "loss": 0.5298, "step": 2170 }, { "epoch": 17.4749498997996, "grad_norm": 6.57069730758667, "learning_rate": 6.720430107526882e-06, "loss": 0.4797, "step": 2180 }, { "epoch": 17.55511022044088, "grad_norm": 5.174849510192871, "learning_rate": 6.4964157706093195e-06, "loss": 0.4697, "step": 2190 }, { "epoch": 17.635270541082164, "grad_norm": 4.489311218261719, "learning_rate": 6.2724014336917564e-06, "loss": 0.4764, "step": 2200 }, { "epoch": 17.715430861723448, "grad_norm": 8.91657829284668, "learning_rate": 6.048387096774194e-06, "loss": 0.4229, "step": 2210 }, { "epoch": 17.79559118236473, "grad_norm": 8.482898712158203, "learning_rate": 5.824372759856631e-06, "loss": 0.4835, "step": 2220 }, { "epoch": 17.87575150300601, "grad_norm": 7.158608436584473, "learning_rate": 5.600358422939068e-06, "loss": 0.4764, "step": 2230 }, { "epoch": 17.955911823647295, "grad_norm": 7.100325107574463, "learning_rate": 5.376344086021506e-06, "loss": 0.4785, "step": 2240 }, { "epoch": 17.995991983967937, "eval_accuracy": 0.8534385569334837, "eval_loss": 0.4337688088417053, "eval_runtime": 14.1747, "eval_samples_per_second": 125.152, "eval_steps_per_second": 3.951, "step": 2245 }, { "epoch": 18.03607214428858, "grad_norm": 6.301604747772217, "learning_rate": 5.152329749103943e-06, "loss": 0.4679, "step": 2250 }, { "epoch": 18.11623246492986, "grad_norm": 8.363497734069824, "learning_rate": 4.928315412186381e-06, "loss": 0.4164, "step": 2260 }, { "epoch": 18.196392785571142, "grad_norm": 6.71609354019165, "learning_rate": 4.7043010752688175e-06, "loss": 0.4596, "step": 2270 }, { "epoch": 18.276553106212425, "grad_norm": 5.690964221954346, "learning_rate": 4.4802867383512545e-06, "loss": 0.4774, "step": 2280 }, { "epoch": 18.35671342685371, "grad_norm": 7.212980270385742, "learning_rate": 4.256272401433692e-06, "loss": 0.5049, "step": 2290 }, { "epoch": 18.43687374749499, "grad_norm": 7.2161149978637695, "learning_rate": 4.032258064516129e-06, "loss": 0.4504, "step": 2300 }, { "epoch": 18.517034068136272, "grad_norm": 5.582963466644287, "learning_rate": 3.808243727598566e-06, "loss": 0.4624, "step": 2310 }, { "epoch": 18.597194388777556, "grad_norm": 6.577459335327148, "learning_rate": 3.5842293906810035e-06, "loss": 0.4636, "step": 2320 }, { "epoch": 18.677354709418836, "grad_norm": 6.8889594078063965, "learning_rate": 3.360215053763441e-06, "loss": 0.4418, "step": 2330 }, { "epoch": 18.75751503006012, "grad_norm": 7.460567951202393, "learning_rate": 3.1362007168458782e-06, "loss": 0.49, "step": 2340 }, { "epoch": 18.837675350701403, "grad_norm": 6.316689491271973, "learning_rate": 2.9121863799283156e-06, "loss": 0.4392, "step": 2350 }, { "epoch": 18.917835671342687, "grad_norm": 7.855792999267578, "learning_rate": 2.688172043010753e-06, "loss": 0.4779, "step": 2360 }, { "epoch": 18.997995991983966, "grad_norm": 5.641603946685791, "learning_rate": 2.4641577060931903e-06, "loss": 0.444, "step": 2370 }, { "epoch": 18.997995991983966, "eval_accuracy": 0.8579481397970687, "eval_loss": 0.4356663227081299, "eval_runtime": 12.8983, "eval_samples_per_second": 137.538, "eval_steps_per_second": 4.342, "step": 2370 }, { "epoch": 19.07815631262525, "grad_norm": 7.950003623962402, "learning_rate": 2.2401433691756272e-06, "loss": 0.4587, "step": 2380 }, { "epoch": 19.158316633266534, "grad_norm": 6.9599995613098145, "learning_rate": 2.0161290322580646e-06, "loss": 0.4754, "step": 2390 }, { "epoch": 19.238476953907817, "grad_norm": 6.827354907989502, "learning_rate": 1.7921146953405017e-06, "loss": 0.4576, "step": 2400 }, { "epoch": 19.318637274549097, "grad_norm": 5.975595951080322, "learning_rate": 1.5681003584229391e-06, "loss": 0.427, "step": 2410 }, { "epoch": 19.39879759519038, "grad_norm": 7.829305648803711, "learning_rate": 1.3440860215053765e-06, "loss": 0.5141, "step": 2420 }, { "epoch": 19.478957915831664, "grad_norm": 5.588257312774658, "learning_rate": 1.1200716845878136e-06, "loss": 0.4609, "step": 2430 }, { "epoch": 19.559118236472948, "grad_norm": 8.075860023498535, "learning_rate": 8.960573476702509e-07, "loss": 0.4977, "step": 2440 }, { "epoch": 19.639278557114228, "grad_norm": 7.977848052978516, "learning_rate": 6.720430107526882e-07, "loss": 0.4329, "step": 2450 }, { "epoch": 19.71943887775551, "grad_norm": 7.050076961517334, "learning_rate": 4.4802867383512544e-07, "loss": 0.4613, "step": 2460 }, { "epoch": 19.799599198396795, "grad_norm": 6.357409954071045, "learning_rate": 2.2401433691756272e-07, "loss": 0.4457, "step": 2470 }, { "epoch": 19.879759519038075, "grad_norm": 4.937966346740723, "learning_rate": 0.0, "loss": 0.4255, "step": 2480 }, { "epoch": 19.879759519038075, "eval_accuracy": 0.8585118376550169, "eval_loss": 0.4356611371040344, "eval_runtime": 26.523, "eval_samples_per_second": 66.885, "eval_steps_per_second": 2.111, "step": 2480 }, { "epoch": 19.879759519038075, "step": 2480, "total_flos": 1.0326291224762253e+19, "train_loss": 0.6660777115052746, "train_runtime": 7387.7979, "train_samples_per_second": 43.212, "train_steps_per_second": 0.336 } ], "logging_steps": 10, "max_steps": 2480, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 1.0326291224762253e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }