diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,7496 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.757632398753894, - "eval_steps": 151, - "global_step": 1057, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0016628559551028891, - "grad_norm": 6.876781701391548, - "learning_rate": 2e-07, - "loss": 1.2322, - "step": 1 - }, - { - "epoch": 0.0016628559551028891, - "eval_loss": 1.6686532497406006, - "eval_runtime": 4.7271, - "eval_samples_per_second": 2.327, - "eval_steps_per_second": 0.635, - "step": 1 - }, - { - "epoch": 0.0033257119102057782, - "grad_norm": 7.15147151737489, - "learning_rate": 4e-07, - "loss": 1.3856, - "step": 2 - }, - { - "epoch": 0.004988567865308668, - "grad_norm": 6.675277261967279, - "learning_rate": 6e-07, - "loss": 1.3114, - "step": 3 - }, - { - "epoch": 0.0066514238204115565, - "grad_norm": 6.276373071109286, - "learning_rate": 8e-07, - "loss": 1.2768, - "step": 4 - }, - { - "epoch": 0.008314279775514447, - "grad_norm": 6.004954010868464, - "learning_rate": 1e-06, - "loss": 1.3114, - "step": 5 - }, - { - "epoch": 0.009977135730617336, - "grad_norm": 6.210557996205687, - "learning_rate": 1.2e-06, - "loss": 1.3777, - "step": 6 - }, - { - "epoch": 0.011639991685720224, - "grad_norm": 6.22480460719047, - "learning_rate": 1.4e-06, - "loss": 1.1786, - "step": 7 - }, - { - "epoch": 0.013302847640823113, - "grad_norm": 4.853536830796908, - "learning_rate": 1.6e-06, - "loss": 1.432, - "step": 8 - }, - { - "epoch": 0.014965703595926003, - "grad_norm": 4.549188866460316, - "learning_rate": 1.8e-06, - "loss": 1.2102, - "step": 9 - }, - { - "epoch": 0.016628559551028894, - "grad_norm": 4.596321987039164, - "learning_rate": 2e-06, - "loss": 1.4431, - "step": 10 - }, - { - "epoch": 0.01829141550613178, - "grad_norm": 3.809686255936276, - "learning_rate": 1.9999984649975976e-06, - "loss": 1.1868, - "step": 11 - }, - { - "epoch": 0.01995427146123467, - "grad_norm": 3.831577778436731, - "learning_rate": 1.999993859995103e-06, - "loss": 1.2103, - "step": 12 - }, - { - "epoch": 0.021617127416337558, - "grad_norm": 3.462783579837251, - "learning_rate": 1.999986185006654e-06, - "loss": 1.2476, - "step": 13 - }, - { - "epoch": 0.02327998337144045, - "grad_norm": 3.3297536194169064, - "learning_rate": 1.999975440055812e-06, - "loss": 1.1819, - "step": 14 - }, - { - "epoch": 0.02494283932654334, - "grad_norm": 3.8443918916019637, - "learning_rate": 1.999961625175565e-06, - "loss": 1.1482, - "step": 15 - }, - { - "epoch": 0.026605695281646226, - "grad_norm": 4.209422853092109, - "learning_rate": 1.999944740408324e-06, - "loss": 1.2177, - "step": 16 - }, - { - "epoch": 0.028268551236749116, - "grad_norm": 3.6743224915375583, - "learning_rate": 1.9999247858059257e-06, - "loss": 1.3183, - "step": 17 - }, - { - "epoch": 0.029931407191852007, - "grad_norm": 3.387454350282322, - "learning_rate": 1.999901761429631e-06, - "loss": 1.1455, - "step": 18 - }, - { - "epoch": 0.0315942631469549, - "grad_norm": 2.9657179454835356, - "learning_rate": 1.9998756673501237e-06, - "loss": 1.1463, - "step": 19 - }, - { - "epoch": 0.03325711910205779, - "grad_norm": 3.455180779074787, - "learning_rate": 1.9998465036475145e-06, - "loss": 1.1571, - "step": 20 - }, - { - "epoch": 0.03491997505716067, - "grad_norm": 2.787618991649651, - "learning_rate": 1.9998142704113346e-06, - "loss": 1.2552, - "step": 21 - }, - { - "epoch": 0.03658283101226356, - "grad_norm": 2.765977452887127, - "learning_rate": 1.9997789677405414e-06, - "loss": 1.2326, - "step": 22 - }, - { - "epoch": 0.03824568696736645, - "grad_norm": 2.6518581076350554, - "learning_rate": 1.9997405957435133e-06, - "loss": 1.2298, - "step": 23 - }, - { - "epoch": 0.03990854292246934, - "grad_norm": 3.143557273823712, - "learning_rate": 1.999699154538053e-06, - "loss": 1.3306, - "step": 24 - }, - { - "epoch": 0.04157139887757223, - "grad_norm": 2.519680050881837, - "learning_rate": 1.999654644251385e-06, - "loss": 1.1641, - "step": 25 - }, - { - "epoch": 0.043234254832675116, - "grad_norm": 2.575750435555698, - "learning_rate": 1.9996070650201564e-06, - "loss": 1.0828, - "step": 26 - }, - { - "epoch": 0.044897110787778007, - "grad_norm": 2.5829804245352403, - "learning_rate": 1.9995564169904354e-06, - "loss": 1.1247, - "step": 27 - }, - { - "epoch": 0.0465599667428809, - "grad_norm": 2.4503279434629874, - "learning_rate": 1.9995027003177116e-06, - "loss": 1.1183, - "step": 28 - }, - { - "epoch": 0.04822282269798379, - "grad_norm": 2.2946133391864487, - "learning_rate": 1.9994459151668956e-06, - "loss": 1.2234, - "step": 29 - }, - { - "epoch": 0.04988567865308668, - "grad_norm": 2.403866063427151, - "learning_rate": 1.9993860617123183e-06, - "loss": 1.1327, - "step": 30 - }, - { - "epoch": 0.05154853460818957, - "grad_norm": 2.271628438557979, - "learning_rate": 1.99932314013773e-06, - "loss": 1.2817, - "step": 31 - }, - { - "epoch": 0.05321139056329245, - "grad_norm": 2.428340751451155, - "learning_rate": 1.9992571506362995e-06, - "loss": 1.1559, - "step": 32 - }, - { - "epoch": 0.05487424651839534, - "grad_norm": 2.2614420815326604, - "learning_rate": 1.999188093410616e-06, - "loss": 1.1504, - "step": 33 - }, - { - "epoch": 0.05653710247349823, - "grad_norm": 2.34799470197157, - "learning_rate": 1.9991159686726847e-06, - "loss": 1.2409, - "step": 34 - }, - { - "epoch": 0.05819995842860112, - "grad_norm": 2.2003715198087193, - "learning_rate": 1.9990407766439296e-06, - "loss": 1.1731, - "step": 35 - }, - { - "epoch": 0.05986281438370401, - "grad_norm": 2.0892124499601645, - "learning_rate": 1.99896251755519e-06, - "loss": 1.1715, - "step": 36 - }, - { - "epoch": 0.061525670338806904, - "grad_norm": 2.2608789864787218, - "learning_rate": 1.998881191646722e-06, - "loss": 1.1871, - "step": 37 - }, - { - "epoch": 0.0631885262939098, - "grad_norm": 2.4451085935787447, - "learning_rate": 1.9987967991681964e-06, - "loss": 1.31, - "step": 38 - }, - { - "epoch": 0.06485138224901268, - "grad_norm": 2.4506760337598776, - "learning_rate": 1.9987093403786983e-06, - "loss": 1.184, - "step": 39 - }, - { - "epoch": 0.06651423820411557, - "grad_norm": 2.5379516806093267, - "learning_rate": 1.9986188155467267e-06, - "loss": 1.1922, - "step": 40 - }, - { - "epoch": 0.06817709415921845, - "grad_norm": 2.35766565204526, - "learning_rate": 1.998525224950194e-06, - "loss": 1.1608, - "step": 41 - }, - { - "epoch": 0.06983995011432134, - "grad_norm": 2.2438854614295933, - "learning_rate": 1.9984285688764225e-06, - "loss": 1.1149, - "step": 42 - }, - { - "epoch": 0.07150280606942423, - "grad_norm": 2.5176050951814792, - "learning_rate": 1.998328847622148e-06, - "loss": 1.194, - "step": 43 - }, - { - "epoch": 0.07316566202452712, - "grad_norm": 3.5070711652473388, - "learning_rate": 1.998226061493514e-06, - "loss": 1.0549, - "step": 44 - }, - { - "epoch": 0.07482851797963001, - "grad_norm": 3.8844422731190065, - "learning_rate": 1.9981202108060757e-06, - "loss": 1.2055, - "step": 45 - }, - { - "epoch": 0.0764913739347329, - "grad_norm": 3.151088308194362, - "learning_rate": 1.9980112958847947e-06, - "loss": 1.3344, - "step": 46 - }, - { - "epoch": 0.0781542298898358, - "grad_norm": 2.8997950166217508, - "learning_rate": 1.9978993170640403e-06, - "loss": 1.2626, - "step": 47 - }, - { - "epoch": 0.07981708584493868, - "grad_norm": 2.7439550744638126, - "learning_rate": 1.9977842746875875e-06, - "loss": 1.2413, - "step": 48 - }, - { - "epoch": 0.08147994180004157, - "grad_norm": 2.252937617615512, - "learning_rate": 1.997666169108618e-06, - "loss": 1.1592, - "step": 49 - }, - { - "epoch": 0.08314279775514447, - "grad_norm": 2.171409977320052, - "learning_rate": 1.9975450006897158e-06, - "loss": 1.2077, - "step": 50 - }, - { - "epoch": 0.08480565371024736, - "grad_norm": 2.384700039420674, - "learning_rate": 1.9974207698028686e-06, - "loss": 1.2758, - "step": 51 - }, - { - "epoch": 0.08646850966535023, - "grad_norm": 2.2038886028156113, - "learning_rate": 1.9972934768294655e-06, - "loss": 1.0498, - "step": 52 - }, - { - "epoch": 0.08813136562045312, - "grad_norm": 2.104326907097657, - "learning_rate": 1.9971631221602976e-06, - "loss": 1.093, - "step": 53 - }, - { - "epoch": 0.08979422157555601, - "grad_norm": 2.174068419901149, - "learning_rate": 1.997029706195553e-06, - "loss": 1.2529, - "step": 54 - }, - { - "epoch": 0.0914570775306589, - "grad_norm": 2.232522077464942, - "learning_rate": 1.9968932293448205e-06, - "loss": 1.093, - "step": 55 - }, - { - "epoch": 0.0931199334857618, - "grad_norm": 3.0592613934800372, - "learning_rate": 1.996753692027084e-06, - "loss": 1.2318, - "step": 56 - }, - { - "epoch": 0.09478278944086468, - "grad_norm": 2.3919925670804103, - "learning_rate": 1.996611094670724e-06, - "loss": 1.1787, - "step": 57 - }, - { - "epoch": 0.09644564539596757, - "grad_norm": 2.839760642367617, - "learning_rate": 1.9964654377135153e-06, - "loss": 1.2441, - "step": 58 - }, - { - "epoch": 0.09810850135107047, - "grad_norm": 2.536098443986401, - "learning_rate": 1.996316721602625e-06, - "loss": 1.2302, - "step": 59 - }, - { - "epoch": 0.09977135730617336, - "grad_norm": 2.2895217740422447, - "learning_rate": 1.9961649467946124e-06, - "loss": 1.2022, - "step": 60 - }, - { - "epoch": 0.10143421326127625, - "grad_norm": 2.1513461743943414, - "learning_rate": 1.996010113755427e-06, - "loss": 1.1063, - "step": 61 - }, - { - "epoch": 0.10309706921637914, - "grad_norm": 2.1179800201775514, - "learning_rate": 1.995852222960407e-06, - "loss": 1.0463, - "step": 62 - }, - { - "epoch": 0.10475992517148203, - "grad_norm": 2.172434098998751, - "learning_rate": 1.9956912748942776e-06, - "loss": 1.1041, - "step": 63 - }, - { - "epoch": 0.1064227811265849, - "grad_norm": 2.254257836851709, - "learning_rate": 1.9955272700511504e-06, - "loss": 1.079, - "step": 64 - }, - { - "epoch": 0.1080856370816878, - "grad_norm": 2.3081819678701234, - "learning_rate": 1.9953602089345213e-06, - "loss": 1.2835, - "step": 65 - }, - { - "epoch": 0.10974849303679068, - "grad_norm": 2.4521650444864047, - "learning_rate": 1.9951900920572684e-06, - "loss": 1.0946, - "step": 66 - }, - { - "epoch": 0.11141134899189357, - "grad_norm": 2.301990359392983, - "learning_rate": 1.9950169199416512e-06, - "loss": 1.1454, - "step": 67 - }, - { - "epoch": 0.11307420494699646, - "grad_norm": 2.159086848434006, - "learning_rate": 1.994840693119309e-06, - "loss": 1.1554, - "step": 68 - }, - { - "epoch": 0.11473706090209936, - "grad_norm": 2.5488828016690834, - "learning_rate": 1.994661412131259e-06, - "loss": 1.1537, - "step": 69 - }, - { - "epoch": 0.11639991685720225, - "grad_norm": 2.171482670853498, - "learning_rate": 1.9944790775278954e-06, - "loss": 1.2015, - "step": 70 - }, - { - "epoch": 0.11806277281230514, - "grad_norm": 2.247147793101678, - "learning_rate": 1.994293689868985e-06, - "loss": 1.1658, - "step": 71 - }, - { - "epoch": 0.11972562876740803, - "grad_norm": 2.120880044321267, - "learning_rate": 1.99410524972367e-06, - "loss": 1.2776, - "step": 72 - }, - { - "epoch": 0.12138848472251092, - "grad_norm": 2.1241719868023434, - "learning_rate": 1.993913757670462e-06, - "loss": 1.1336, - "step": 73 - }, - { - "epoch": 0.12305134067761381, - "grad_norm": 2.326506113842894, - "learning_rate": 1.9937192142972426e-06, - "loss": 1.2217, - "step": 74 - }, - { - "epoch": 0.12471419663271668, - "grad_norm": 2.2329181207257536, - "learning_rate": 1.9935216202012607e-06, - "loss": 1.2696, - "step": 75 - }, - { - "epoch": 0.1263770525878196, - "grad_norm": 2.4936798260412947, - "learning_rate": 1.993320975989131e-06, - "loss": 1.1711, - "step": 76 - }, - { - "epoch": 0.12803990854292246, - "grad_norm": 1.9925320853249602, - "learning_rate": 1.993117282276833e-06, - "loss": 1.1406, - "step": 77 - }, - { - "epoch": 0.12970276449802537, - "grad_norm": 2.2228263048635073, - "learning_rate": 1.992910539689707e-06, - "loss": 1.1856, - "step": 78 - }, - { - "epoch": 0.13136562045312825, - "grad_norm": 2.316736137818131, - "learning_rate": 1.9927007488624534e-06, - "loss": 1.1939, - "step": 79 - }, - { - "epoch": 0.13302847640823115, - "grad_norm": 2.2236401181022227, - "learning_rate": 1.9924879104391306e-06, - "loss": 1.1508, - "step": 80 - }, - { - "epoch": 0.13469133236333403, - "grad_norm": 2.3513673044729053, - "learning_rate": 1.992272025073155e-06, - "loss": 1.1877, - "step": 81 - }, - { - "epoch": 0.1363541883184369, - "grad_norm": 2.1953606776371655, - "learning_rate": 1.9920530934272946e-06, - "loss": 1.0935, - "step": 82 - }, - { - "epoch": 0.1380170442735398, - "grad_norm": 2.1614518656924693, - "learning_rate": 1.9918311161736713e-06, - "loss": 1.0932, - "step": 83 - }, - { - "epoch": 0.13967990022864268, - "grad_norm": 2.2701167674984117, - "learning_rate": 1.9916060939937557e-06, - "loss": 1.2265, - "step": 84 - }, - { - "epoch": 0.1413427561837456, - "grad_norm": 2.186355070972481, - "learning_rate": 1.9913780275783674e-06, - "loss": 1.102, - "step": 85 - }, - { - "epoch": 0.14300561213884846, - "grad_norm": 2.1276524267117267, - "learning_rate": 1.991146917627671e-06, - "loss": 1.2096, - "step": 86 - }, - { - "epoch": 0.14466846809395137, - "grad_norm": 2.2049755692680653, - "learning_rate": 1.9909127648511754e-06, - "loss": 1.1638, - "step": 87 - }, - { - "epoch": 0.14633132404905425, - "grad_norm": 2.193976635251867, - "learning_rate": 1.990675569967731e-06, - "loss": 1.3051, - "step": 88 - }, - { - "epoch": 0.14799418000415715, - "grad_norm": 2.3179428851026205, - "learning_rate": 1.990435333705527e-06, - "loss": 1.1257, - "step": 89 - }, - { - "epoch": 0.14965703595926003, - "grad_norm": 2.9183136856468925, - "learning_rate": 1.9901920568020894e-06, - "loss": 1.0138, - "step": 90 - }, - { - "epoch": 0.15131989191436293, - "grad_norm": 2.0666844143324057, - "learning_rate": 1.9899457400042806e-06, - "loss": 1.0314, - "step": 91 - }, - { - "epoch": 0.1529827478694658, - "grad_norm": 2.146945135936375, - "learning_rate": 1.9896963840682935e-06, - "loss": 1.14, - "step": 92 - }, - { - "epoch": 0.15464560382456868, - "grad_norm": 2.0563546518727427, - "learning_rate": 1.989443989759652e-06, - "loss": 1.1322, - "step": 93 - }, - { - "epoch": 0.1563084597796716, - "grad_norm": 2.0942173553840124, - "learning_rate": 1.989188557853208e-06, - "loss": 1.1109, - "step": 94 - }, - { - "epoch": 0.15797131573477446, - "grad_norm": 2.077080441568994, - "learning_rate": 1.9889300891331387e-06, - "loss": 1.0046, - "step": 95 - }, - { - "epoch": 0.15963417168987737, - "grad_norm": 2.3184144062168803, - "learning_rate": 1.9886685843929446e-06, - "loss": 1.18, - "step": 96 - }, - { - "epoch": 0.16129702764498025, - "grad_norm": 2.116450161143651, - "learning_rate": 1.988404044435446e-06, - "loss": 1.1773, - "step": 97 - }, - { - "epoch": 0.16295988360008315, - "grad_norm": 2.195134284878133, - "learning_rate": 1.988136470072782e-06, - "loss": 1.1737, - "step": 98 - }, - { - "epoch": 0.16462273955518603, - "grad_norm": 2.2738180312076977, - "learning_rate": 1.987865862126408e-06, - "loss": 1.2176, - "step": 99 - }, - { - "epoch": 0.16628559551028893, - "grad_norm": 2.31803637733792, - "learning_rate": 1.98759222142709e-06, - "loss": 1.0916, - "step": 100 - }, - { - "epoch": 0.1679484514653918, - "grad_norm": 2.1944116623564676, - "learning_rate": 1.9873155488149076e-06, - "loss": 1.1836, - "step": 101 - }, - { - "epoch": 0.1696113074204947, - "grad_norm": 2.220799107434225, - "learning_rate": 1.9870358451392464e-06, - "loss": 1.0624, - "step": 102 - }, - { - "epoch": 0.1712741633755976, - "grad_norm": 2.2574992766238373, - "learning_rate": 1.9867531112587987e-06, - "loss": 1.0148, - "step": 103 - }, - { - "epoch": 0.17293701933070046, - "grad_norm": 2.2367520120502524, - "learning_rate": 1.9864673480415585e-06, - "loss": 1.2407, - "step": 104 - }, - { - "epoch": 0.17459987528580337, - "grad_norm": 2.4288409651842477, - "learning_rate": 1.98617855636482e-06, - "loss": 1.0532, - "step": 105 - }, - { - "epoch": 0.17626273124090625, - "grad_norm": 2.422107031056536, - "learning_rate": 1.9858867371151753e-06, - "loss": 1.1696, - "step": 106 - }, - { - "epoch": 0.17792558719600915, - "grad_norm": 2.1987680710210693, - "learning_rate": 1.9855918911885106e-06, - "loss": 1.0976, - "step": 107 - }, - { - "epoch": 0.17958844315111203, - "grad_norm": 2.0312525346517307, - "learning_rate": 1.985294019490005e-06, - "loss": 1.052, - "step": 108 - }, - { - "epoch": 0.18125129910621493, - "grad_norm": 2.2392770140952116, - "learning_rate": 1.9849931229341256e-06, - "loss": 1.0335, - "step": 109 - }, - { - "epoch": 0.1829141550613178, - "grad_norm": 2.4061693671071227, - "learning_rate": 1.984689202444626e-06, - "loss": 1.2724, - "step": 110 - }, - { - "epoch": 0.1845770110164207, - "grad_norm": 2.381783600731093, - "learning_rate": 1.984382258954544e-06, - "loss": 1.0571, - "step": 111 - }, - { - "epoch": 0.1862398669715236, - "grad_norm": 2.3568823162290076, - "learning_rate": 1.984072293406197e-06, - "loss": 1.1028, - "step": 112 - }, - { - "epoch": 0.1879027229266265, - "grad_norm": 3.889820858228295, - "learning_rate": 1.983759306751182e-06, - "loss": 1.2139, - "step": 113 - }, - { - "epoch": 0.18956557888172937, - "grad_norm": 2.2124619744454415, - "learning_rate": 1.9834432999503684e-06, - "loss": 1.0924, - "step": 114 - }, - { - "epoch": 0.19122843483683227, - "grad_norm": 2.3492979702975023, - "learning_rate": 1.9831242739738986e-06, - "loss": 1.2233, - "step": 115 - }, - { - "epoch": 0.19289129079193515, - "grad_norm": 2.6172379375651347, - "learning_rate": 1.982802229801184e-06, - "loss": 1.1797, - "step": 116 - }, - { - "epoch": 0.19455414674703803, - "grad_norm": 2.195119546109998, - "learning_rate": 1.9824771684209024e-06, - "loss": 1.0346, - "step": 117 - }, - { - "epoch": 0.19621700270214093, - "grad_norm": 2.1198330508663, - "learning_rate": 1.982149090830993e-06, - "loss": 1.2163, - "step": 118 - }, - { - "epoch": 0.1978798586572438, - "grad_norm": 2.23691299776538, - "learning_rate": 1.981817998038656e-06, - "loss": 1.1865, - "step": 119 - }, - { - "epoch": 0.1995427146123467, - "grad_norm": 2.088815523084208, - "learning_rate": 1.981483891060348e-06, - "loss": 1.1199, - "step": 120 - }, - { - "epoch": 0.2012055705674496, - "grad_norm": 2.273270712463945, - "learning_rate": 1.9811467709217785e-06, - "loss": 1.2325, - "step": 121 - }, - { - "epoch": 0.2028684265225525, - "grad_norm": 2.18183229638507, - "learning_rate": 1.980806638657908e-06, - "loss": 1.2246, - "step": 122 - }, - { - "epoch": 0.20453128247765537, - "grad_norm": 3.193814855632699, - "learning_rate": 1.980463495312945e-06, - "loss": 1.0395, - "step": 123 - }, - { - "epoch": 0.20619413843275827, - "grad_norm": 2.465032307682835, - "learning_rate": 1.9801173419403404e-06, - "loss": 1.171, - "step": 124 - }, - { - "epoch": 0.20785699438786115, - "grad_norm": 2.2673988159196665, - "learning_rate": 1.979768179602787e-06, - "loss": 1.0496, - "step": 125 - }, - { - "epoch": 0.20951985034296405, - "grad_norm": 2.1447719311727265, - "learning_rate": 1.9794160093722147e-06, - "loss": 1.0066, - "step": 126 - }, - { - "epoch": 0.21118270629806693, - "grad_norm": 2.760076601802052, - "learning_rate": 1.979060832329788e-06, - "loss": 1.1358, - "step": 127 - }, - { - "epoch": 0.2128455622531698, - "grad_norm": 2.2167665381661865, - "learning_rate": 1.978702649565902e-06, - "loss": 1.114, - "step": 128 - }, - { - "epoch": 0.2145084182082727, - "grad_norm": 2.2791559309596594, - "learning_rate": 1.9783414621801796e-06, - "loss": 1.0434, - "step": 129 - }, - { - "epoch": 0.2161712741633756, - "grad_norm": 2.432104553889974, - "learning_rate": 1.9779772712814675e-06, - "loss": 1.2542, - "step": 130 - }, - { - "epoch": 0.2178341301184785, - "grad_norm": 2.7306377502903985, - "learning_rate": 1.9776100779878343e-06, - "loss": 1.0987, - "step": 131 - }, - { - "epoch": 0.21949698607358137, - "grad_norm": 2.7879620153482114, - "learning_rate": 1.977239883426564e-06, - "loss": 1.1479, - "step": 132 - }, - { - "epoch": 0.22115984202868427, - "grad_norm": 2.1514985446900354, - "learning_rate": 1.9768666887341564e-06, - "loss": 1.2583, - "step": 133 - }, - { - "epoch": 0.22282269798378715, - "grad_norm": 2.0415497832604057, - "learning_rate": 1.9764904950563213e-06, - "loss": 1.1969, - "step": 134 - }, - { - "epoch": 0.22448555393889005, - "grad_norm": 2.2681683904958403, - "learning_rate": 1.9761113035479745e-06, - "loss": 1.2112, - "step": 135 - }, - { - "epoch": 0.22614840989399293, - "grad_norm": 2.260381990967556, - "learning_rate": 1.975729115373236e-06, - "loss": 1.1884, - "step": 136 - }, - { - "epoch": 0.22781126584909583, - "grad_norm": 2.384237988040654, - "learning_rate": 1.9753439317054253e-06, - "loss": 0.9922, - "step": 137 - }, - { - "epoch": 0.2294741218041987, - "grad_norm": 2.2694287379916007, - "learning_rate": 1.9749557537270584e-06, - "loss": 1.0884, - "step": 138 - }, - { - "epoch": 0.2311369777593016, - "grad_norm": 2.358144314592439, - "learning_rate": 1.974564582629843e-06, - "loss": 1.1049, - "step": 139 - }, - { - "epoch": 0.2327998337144045, - "grad_norm": 2.1123073465297306, - "learning_rate": 1.9741704196146766e-06, - "loss": 1.0306, - "step": 140 - }, - { - "epoch": 0.23446268966950737, - "grad_norm": 2.1512707106124798, - "learning_rate": 1.973773265891641e-06, - "loss": 1.0528, - "step": 141 - }, - { - "epoch": 0.23612554562461027, - "grad_norm": 2.172534073331255, - "learning_rate": 1.973373122680001e-06, - "loss": 1.1115, - "step": 142 - }, - { - "epoch": 0.23778840157971315, - "grad_norm": 2.3733216420093473, - "learning_rate": 1.972969991208198e-06, - "loss": 1.1323, - "step": 143 - }, - { - "epoch": 0.23945125753481605, - "grad_norm": 2.4625937737389094, - "learning_rate": 1.9725638727138465e-06, - "loss": 1.0889, - "step": 144 - }, - { - "epoch": 0.24111411348991893, - "grad_norm": 2.2420911044484395, - "learning_rate": 1.9721547684437332e-06, - "loss": 1.0748, - "step": 145 - }, - { - "epoch": 0.24277696944502183, - "grad_norm": 2.274748664959236, - "learning_rate": 1.97174267965381e-06, - "loss": 1.2631, - "step": 146 - }, - { - "epoch": 0.2444398254001247, - "grad_norm": 2.251081234626043, - "learning_rate": 1.9713276076091916e-06, - "loss": 1.2066, - "step": 147 - }, - { - "epoch": 0.24610268135522761, - "grad_norm": 2.2539634056834137, - "learning_rate": 1.970909553584151e-06, - "loss": 1.0727, - "step": 148 - }, - { - "epoch": 0.2477655373103305, - "grad_norm": 2.1434067229672094, - "learning_rate": 1.9704885188621157e-06, - "loss": 1.1082, - "step": 149 - }, - { - "epoch": 0.24942839326543337, - "grad_norm": 2.236934300850519, - "learning_rate": 1.970064504735665e-06, - "loss": 1.1063, - "step": 150 - }, - { - "epoch": 0.2510912492205363, - "grad_norm": 2.760825374012749, - "learning_rate": 1.969637512506524e-06, - "loss": 1.2253, - "step": 151 - }, - { - "epoch": 0.2510912492205363, - "eval_loss": 1.4304277896881104, - "eval_runtime": 24.5872, - "eval_samples_per_second": 0.447, - "eval_steps_per_second": 0.122, - "step": 151 - }, - { - "epoch": 0.2527541051756392, - "grad_norm": 2.548532327058704, - "learning_rate": 1.9692075434855604e-06, - "loss": 1.1196, - "step": 152 - }, - { - "epoch": 0.254416961130742, - "grad_norm": 2.3149961972538464, - "learning_rate": 1.9687745989927824e-06, - "loss": 1.093, - "step": 153 - }, - { - "epoch": 0.25607981708584493, - "grad_norm": 2.2766910929781257, - "learning_rate": 1.96833868035733e-06, - "loss": 1.1225, - "step": 154 - }, - { - "epoch": 0.25774267304094783, - "grad_norm": 2.7395763625060674, - "learning_rate": 1.967899788917477e-06, - "loss": 1.1423, - "step": 155 - }, - { - "epoch": 0.25940552899605074, - "grad_norm": 2.389993560718361, - "learning_rate": 1.9674579260206213e-06, - "loss": 1.1823, - "step": 156 - }, - { - "epoch": 0.2610683849511536, - "grad_norm": 2.483941805273633, - "learning_rate": 1.967013093023285e-06, - "loss": 1.2279, - "step": 157 - }, - { - "epoch": 0.2627312409062565, - "grad_norm": 2.2477103547008914, - "learning_rate": 1.9665652912911065e-06, - "loss": 1.1962, - "step": 158 - }, - { - "epoch": 0.2643940968613594, - "grad_norm": 2.841612920075852, - "learning_rate": 1.9661145221988398e-06, - "loss": 1.1292, - "step": 159 - }, - { - "epoch": 0.2660569528164623, - "grad_norm": 2.5188774282226487, - "learning_rate": 1.9656607871303485e-06, - "loss": 1.0174, - "step": 160 - }, - { - "epoch": 0.26771980877156515, - "grad_norm": 2.601863007672941, - "learning_rate": 1.9652040874786007e-06, - "loss": 1.1803, - "step": 161 - }, - { - "epoch": 0.26938266472666805, - "grad_norm": 2.1729305988620027, - "learning_rate": 1.964744424645667e-06, - "loss": 1.0692, - "step": 162 - }, - { - "epoch": 0.27104552068177096, - "grad_norm": 2.4137099619779017, - "learning_rate": 1.964281800042714e-06, - "loss": 1.2412, - "step": 163 - }, - { - "epoch": 0.2727083766368738, - "grad_norm": 2.4257004197876437, - "learning_rate": 1.9638162150900025e-06, - "loss": 1.1754, - "step": 164 - }, - { - "epoch": 0.2743712325919767, - "grad_norm": 2.127187241617996, - "learning_rate": 1.96334767121688e-06, - "loss": 1.1212, - "step": 165 - }, - { - "epoch": 0.2760340885470796, - "grad_norm": 2.5505445228517796, - "learning_rate": 1.962876169861778e-06, - "loss": 1.1873, - "step": 166 - }, - { - "epoch": 0.2776969445021825, - "grad_norm": 2.362761021929743, - "learning_rate": 1.9624017124722085e-06, - "loss": 1.1519, - "step": 167 - }, - { - "epoch": 0.27935980045728537, - "grad_norm": 2.3485374311573874, - "learning_rate": 1.9619243005047574e-06, - "loss": 1.2356, - "step": 168 - }, - { - "epoch": 0.28102265641238827, - "grad_norm": 2.1257685683782066, - "learning_rate": 1.9614439354250824e-06, - "loss": 1.1825, - "step": 169 - }, - { - "epoch": 0.2826855123674912, - "grad_norm": 2.395738763591683, - "learning_rate": 1.960960618707906e-06, - "loss": 1.1009, - "step": 170 - }, - { - "epoch": 0.2843483683225941, - "grad_norm": 2.2654812048452855, - "learning_rate": 1.9604743518370133e-06, - "loss": 1.1115, - "step": 171 - }, - { - "epoch": 0.28601122427769693, - "grad_norm": 2.377019893527625, - "learning_rate": 1.959985136305246e-06, - "loss": 1.1096, - "step": 172 - }, - { - "epoch": 0.28767408023279983, - "grad_norm": 2.2583188742925597, - "learning_rate": 1.9594929736144973e-06, - "loss": 1.0579, - "step": 173 - }, - { - "epoch": 0.28933693618790274, - "grad_norm": 2.2955420850835644, - "learning_rate": 1.9589978652757096e-06, - "loss": 1.0044, - "step": 174 - }, - { - "epoch": 0.2909997921430056, - "grad_norm": 2.135658412712305, - "learning_rate": 1.9584998128088683e-06, - "loss": 0.9674, - "step": 175 - }, - { - "epoch": 0.2926626480981085, - "grad_norm": 2.3814784805947435, - "learning_rate": 1.9579988177429965e-06, - "loss": 0.9844, - "step": 176 - }, - { - "epoch": 0.2943255040532114, - "grad_norm": 4.249522359005645, - "learning_rate": 1.957494881616151e-06, - "loss": 1.0733, - "step": 177 - }, - { - "epoch": 0.2959883600083143, - "grad_norm": 2.1491559170675005, - "learning_rate": 1.956988005975419e-06, - "loss": 1.1137, - "step": 178 - }, - { - "epoch": 0.29765121596341715, - "grad_norm": 2.237150018318634, - "learning_rate": 1.9564781923769105e-06, - "loss": 1.1954, - "step": 179 - }, - { - "epoch": 0.29931407191852005, - "grad_norm": 2.051176936877328, - "learning_rate": 1.955965442385756e-06, - "loss": 1.132, - "step": 180 - }, - { - "epoch": 0.30097692787362296, - "grad_norm": 3.5719215857546045, - "learning_rate": 1.9554497575761e-06, - "loss": 1.2088, - "step": 181 - }, - { - "epoch": 0.30263978382872586, - "grad_norm": 2.0583047844957343, - "learning_rate": 1.9549311395310982e-06, - "loss": 1.0338, - "step": 182 - }, - { - "epoch": 0.3043026397838287, - "grad_norm": 2.356690350916363, - "learning_rate": 1.9544095898429094e-06, - "loss": 1.1253, - "step": 183 - }, - { - "epoch": 0.3059654957389316, - "grad_norm": 2.31830959115815, - "learning_rate": 1.9538851101126944e-06, - "loss": 0.9722, - "step": 184 - }, - { - "epoch": 0.3076283516940345, - "grad_norm": 2.2973614356210503, - "learning_rate": 1.9533577019506085e-06, - "loss": 1.1471, - "step": 185 - }, - { - "epoch": 0.30929120764913737, - "grad_norm": 2.2790981590004233, - "learning_rate": 1.952827366975797e-06, - "loss": 1.0407, - "step": 186 - }, - { - "epoch": 0.31095406360424027, - "grad_norm": 2.2618992906420714, - "learning_rate": 1.952294106816391e-06, - "loss": 1.2548, - "step": 187 - }, - { - "epoch": 0.3126169195593432, - "grad_norm": 2.644954828913663, - "learning_rate": 1.9517579231095015e-06, - "loss": 1.2529, - "step": 188 - }, - { - "epoch": 0.3142797755144461, - "grad_norm": 2.256019290547997, - "learning_rate": 1.9512188175012153e-06, - "loss": 1.1046, - "step": 189 - }, - { - "epoch": 0.31594263146954893, - "grad_norm": 2.5306772178751133, - "learning_rate": 1.950676791646589e-06, - "loss": 1.1444, - "step": 190 - }, - { - "epoch": 0.31760548742465183, - "grad_norm": 2.530764997510064, - "learning_rate": 1.9501318472096447e-06, - "loss": 1.1185, - "step": 191 - }, - { - "epoch": 0.31926834337975474, - "grad_norm": 2.308534558092756, - "learning_rate": 1.9495839858633648e-06, - "loss": 1.1669, - "step": 192 - }, - { - "epoch": 0.32093119933485764, - "grad_norm": 2.2116575569490924, - "learning_rate": 1.9490332092896857e-06, - "loss": 1.1689, - "step": 193 - }, - { - "epoch": 0.3225940552899605, - "grad_norm": 3.098403349489372, - "learning_rate": 1.9484795191794943e-06, - "loss": 1.0652, - "step": 194 - }, - { - "epoch": 0.3242569112450634, - "grad_norm": 2.2040383274861965, - "learning_rate": 1.947922917232622e-06, - "loss": 0.9792, - "step": 195 - }, - { - "epoch": 0.3259197672001663, - "grad_norm": 2.2311156638317557, - "learning_rate": 1.9473634051578394e-06, - "loss": 1.0935, - "step": 196 - }, - { - "epoch": 0.32758262315526915, - "grad_norm": 2.2800635132242997, - "learning_rate": 1.946800984672851e-06, - "loss": 1.2076, - "step": 197 - }, - { - "epoch": 0.32924547911037205, - "grad_norm": 3.654896049770491, - "learning_rate": 1.946235657504291e-06, - "loss": 1.1199, - "step": 198 - }, - { - "epoch": 0.33090833506547496, - "grad_norm": 3.21717248498429, - "learning_rate": 1.945667425387716e-06, - "loss": 1.0107, - "step": 199 - }, - { - "epoch": 0.33257119102057786, - "grad_norm": 2.401723766209091, - "learning_rate": 1.9450962900676014e-06, - "loss": 1.1726, - "step": 200 - }, - { - "epoch": 0.3342340469756807, - "grad_norm": 2.395261717837967, - "learning_rate": 1.9445222532973356e-06, - "loss": 1.1804, - "step": 201 - }, - { - "epoch": 0.3358969029307836, - "grad_norm": 2.1446239585083053, - "learning_rate": 1.943945316839214e-06, - "loss": 1.1057, - "step": 202 - }, - { - "epoch": 0.3375597588858865, - "grad_norm": 2.316344120227303, - "learning_rate": 1.9433654824644343e-06, - "loss": 1.2313, - "step": 203 - }, - { - "epoch": 0.3392226148409894, - "grad_norm": 2.260699038364008, - "learning_rate": 1.9427827519530917e-06, - "loss": 1.0259, - "step": 204 - }, - { - "epoch": 0.34088547079609227, - "grad_norm": 2.2098708691255027, - "learning_rate": 1.94219712709417e-06, - "loss": 1.162, - "step": 205 - }, - { - "epoch": 0.3425483267511952, - "grad_norm": 2.177000943321658, - "learning_rate": 1.9416086096855414e-06, - "loss": 1.1537, - "step": 206 - }, - { - "epoch": 0.3442111827062981, - "grad_norm": 2.1645641329995127, - "learning_rate": 1.9410172015339575e-06, - "loss": 1.0991, - "step": 207 - }, - { - "epoch": 0.34587403866140093, - "grad_norm": 2.352398281001825, - "learning_rate": 1.940422904455043e-06, - "loss": 1.1333, - "step": 208 - }, - { - "epoch": 0.34753689461650383, - "grad_norm": 2.3240858436164404, - "learning_rate": 1.939825720273294e-06, - "loss": 1.1263, - "step": 209 - }, - { - "epoch": 0.34919975057160674, - "grad_norm": 2.2225865374520404, - "learning_rate": 1.939225650822068e-06, - "loss": 1.1092, - "step": 210 - }, - { - "epoch": 0.35086260652670964, - "grad_norm": 2.344261138949479, - "learning_rate": 1.938622697943581e-06, - "loss": 1.1262, - "step": 211 - }, - { - "epoch": 0.3525254624818125, - "grad_norm": 2.0594249160281817, - "learning_rate": 1.938016863488902e-06, - "loss": 1.03, - "step": 212 - }, - { - "epoch": 0.3541883184369154, - "grad_norm": 2.197981511279412, - "learning_rate": 1.9374081493179453e-06, - "loss": 1.2617, - "step": 213 - }, - { - "epoch": 0.3558511743920183, - "grad_norm": 2.3896742142995717, - "learning_rate": 1.9367965572994663e-06, - "loss": 1.1666, - "step": 214 - }, - { - "epoch": 0.3575140303471212, - "grad_norm": 2.2197343813094075, - "learning_rate": 1.9361820893110554e-06, - "loss": 1.1399, - "step": 215 - }, - { - "epoch": 0.35917688630222405, - "grad_norm": 2.0777320432807684, - "learning_rate": 1.9355647472391325e-06, - "loss": 0.997, - "step": 216 - }, - { - "epoch": 0.36083974225732696, - "grad_norm": 2.1390003170764262, - "learning_rate": 1.93494453297894e-06, - "loss": 1.0987, - "step": 217 - }, - { - "epoch": 0.36250259821242986, - "grad_norm": 2.2336666277286485, - "learning_rate": 1.93432144843454e-06, - "loss": 1.1072, - "step": 218 - }, - { - "epoch": 0.36416545416753276, - "grad_norm": 2.5151519435391565, - "learning_rate": 1.933695495518804e-06, - "loss": 1.187, - "step": 219 - }, - { - "epoch": 0.3658283101226356, - "grad_norm": 2.237546644093813, - "learning_rate": 1.9330666761534104e-06, - "loss": 1.1992, - "step": 220 - }, - { - "epoch": 0.3674911660777385, - "grad_norm": 2.20747601132904, - "learning_rate": 1.932434992268838e-06, - "loss": 1.25, - "step": 221 - }, - { - "epoch": 0.3691540220328414, - "grad_norm": 2.0703921340607954, - "learning_rate": 1.9318004458043595e-06, - "loss": 0.9968, - "step": 222 - }, - { - "epoch": 0.37081687798794427, - "grad_norm": 2.6998540968138203, - "learning_rate": 1.9311630387080355e-06, - "loss": 1.1253, - "step": 223 - }, - { - "epoch": 0.3724797339430472, - "grad_norm": 2.2382556329258083, - "learning_rate": 1.9305227729367088e-06, - "loss": 1.033, - "step": 224 - }, - { - "epoch": 0.3741425898981501, - "grad_norm": 2.0744866884439426, - "learning_rate": 1.929879650455998e-06, - "loss": 0.9939, - "step": 225 - }, - { - "epoch": 0.375805445853253, - "grad_norm": 2.1380855568440293, - "learning_rate": 1.929233673240293e-06, - "loss": 1.1454, - "step": 226 - }, - { - "epoch": 0.37746830180835583, - "grad_norm": 2.3748389575645783, - "learning_rate": 1.928584843272746e-06, - "loss": 1.0883, - "step": 227 - }, - { - "epoch": 0.37913115776345874, - "grad_norm": 1.9756985785164076, - "learning_rate": 1.9279331625452694e-06, - "loss": 1.0773, - "step": 228 - }, - { - "epoch": 0.38079401371856164, - "grad_norm": 2.8312536305491416, - "learning_rate": 1.927278633058525e-06, - "loss": 1.1389, - "step": 229 - }, - { - "epoch": 0.38245686967366455, - "grad_norm": 2.130686929883375, - "learning_rate": 1.926621256821922e-06, - "loss": 1.0132, - "step": 230 - }, - { - "epoch": 0.3841197256287674, - "grad_norm": 2.305764167292808, - "learning_rate": 1.9259610358536085e-06, - "loss": 1.1054, - "step": 231 - }, - { - "epoch": 0.3857825815838703, - "grad_norm": 2.3978186298387683, - "learning_rate": 1.9252979721804657e-06, - "loss": 1.0411, - "step": 232 - }, - { - "epoch": 0.3874454375389732, - "grad_norm": 2.195123698256728, - "learning_rate": 1.9246320678381032e-06, - "loss": 1.1931, - "step": 233 - }, - { - "epoch": 0.38910829349407605, - "grad_norm": 2.6254493837547996, - "learning_rate": 1.92396332487085e-06, - "loss": 1.1892, - "step": 234 - }, - { - "epoch": 0.39077114944917896, - "grad_norm": 2.419866448691614, - "learning_rate": 1.9232917453317492e-06, - "loss": 1.0236, - "step": 235 - }, - { - "epoch": 0.39243400540428186, - "grad_norm": 2.1862355398588287, - "learning_rate": 1.9226173312825553e-06, - "loss": 1.0528, - "step": 236 - }, - { - "epoch": 0.39409686135938476, - "grad_norm": 2.3764854199802707, - "learning_rate": 1.921940084793721e-06, - "loss": 1.0602, - "step": 237 - }, - { - "epoch": 0.3957597173144876, - "grad_norm": 2.4103818910002057, - "learning_rate": 1.921260007944397e-06, - "loss": 0.9984, - "step": 238 - }, - { - "epoch": 0.3974225732695905, - "grad_norm": 2.1867489158332396, - "learning_rate": 1.920577102822422e-06, - "loss": 1.1426, - "step": 239 - }, - { - "epoch": 0.3990854292246934, - "grad_norm": 2.5978893922687964, - "learning_rate": 1.9198913715243182e-06, - "loss": 1.1274, - "step": 240 - }, - { - "epoch": 0.4007482851797963, - "grad_norm": 2.339459827714988, - "learning_rate": 1.9192028161552843e-06, - "loss": 1.0478, - "step": 241 - }, - { - "epoch": 0.4024111411348992, - "grad_norm": 2.3678596214542162, - "learning_rate": 1.9185114388291886e-06, - "loss": 1.0155, - "step": 242 - }, - { - "epoch": 0.4040739970900021, - "grad_norm": 2.1203964226087106, - "learning_rate": 1.9178172416685625e-06, - "loss": 1.0326, - "step": 243 - }, - { - "epoch": 0.405736853045105, - "grad_norm": 2.333908782402527, - "learning_rate": 1.9171202268045946e-06, - "loss": 1.0474, - "step": 244 - }, - { - "epoch": 0.40739970900020783, - "grad_norm": 2.9030943393253454, - "learning_rate": 1.9164203963771243e-06, - "loss": 1.1753, - "step": 245 - }, - { - "epoch": 0.40906256495531074, - "grad_norm": 2.4513503490604087, - "learning_rate": 1.915717752534634e-06, - "loss": 1.0477, - "step": 246 - }, - { - "epoch": 0.41072542091041364, - "grad_norm": 2.0267508782326735, - "learning_rate": 1.915012297434243e-06, - "loss": 1.0991, - "step": 247 - }, - { - "epoch": 0.41238827686551655, - "grad_norm": 2.184393317130635, - "learning_rate": 1.9143040332417036e-06, - "loss": 1.2479, - "step": 248 - }, - { - "epoch": 0.4140511328206194, - "grad_norm": 2.026607524191998, - "learning_rate": 1.9135929621313886e-06, - "loss": 1.0745, - "step": 249 - }, - { - "epoch": 0.4157139887757223, - "grad_norm": 2.66566941286477, - "learning_rate": 1.912879086286291e-06, - "loss": 0.9754, - "step": 250 - }, - { - "epoch": 0.4173768447308252, - "grad_norm": 2.1367244864183625, - "learning_rate": 1.9121624078980122e-06, - "loss": 1.1809, - "step": 251 - }, - { - "epoch": 0.4190397006859281, - "grad_norm": 2.38106027954001, - "learning_rate": 1.911442929166758e-06, - "loss": 1.105, - "step": 252 - }, - { - "epoch": 0.42070255664103096, - "grad_norm": 2.157838169653484, - "learning_rate": 1.910720652301333e-06, - "loss": 1.1357, - "step": 253 - }, - { - "epoch": 0.42236541259613386, - "grad_norm": 2.4900405664879135, - "learning_rate": 1.9099955795191292e-06, - "loss": 1.1238, - "step": 254 - }, - { - "epoch": 0.42402826855123676, - "grad_norm": 2.2488268195059717, - "learning_rate": 1.9092677130461245e-06, - "loss": 1.0831, - "step": 255 - }, - { - "epoch": 0.4256911245063396, - "grad_norm": 2.266782137690799, - "learning_rate": 1.9085370551168718e-06, - "loss": 1.0905, - "step": 256 - }, - { - "epoch": 0.4273539804614425, - "grad_norm": 3.7403186028520934, - "learning_rate": 1.9078036079744947e-06, - "loss": 1.101, - "step": 257 - }, - { - "epoch": 0.4290168364165454, - "grad_norm": 2.455666608661609, - "learning_rate": 1.9070673738706796e-06, - "loss": 1.1266, - "step": 258 - }, - { - "epoch": 0.4306796923716483, - "grad_norm": 2.515358176239364, - "learning_rate": 1.9063283550656687e-06, - "loss": 1.1149, - "step": 259 - }, - { - "epoch": 0.4323425483267512, - "grad_norm": 2.210842033312207, - "learning_rate": 1.905586553828253e-06, - "loss": 1.0168, - "step": 260 - }, - { - "epoch": 0.4340054042818541, - "grad_norm": 2.295182467603218, - "learning_rate": 1.9048419724357658e-06, - "loss": 1.173, - "step": 261 - }, - { - "epoch": 0.435668260236957, - "grad_norm": 2.1506193772075886, - "learning_rate": 1.9040946131740762e-06, - "loss": 1.084, - "step": 262 - }, - { - "epoch": 0.4373311161920599, - "grad_norm": 2.0996773232405403, - "learning_rate": 1.9033444783375804e-06, - "loss": 1.1712, - "step": 263 - }, - { - "epoch": 0.43899397214716274, - "grad_norm": 1.9545895155808832, - "learning_rate": 1.9025915702291954e-06, - "loss": 0.9843, - "step": 264 - }, - { - "epoch": 0.44065682810226564, - "grad_norm": 2.2269341512176224, - "learning_rate": 1.9018358911603535e-06, - "loss": 1.1599, - "step": 265 - }, - { - "epoch": 0.44231968405736855, - "grad_norm": 2.1685909801965297, - "learning_rate": 1.901077443450993e-06, - "loss": 1.0454, - "step": 266 - }, - { - "epoch": 0.4439825400124714, - "grad_norm": 2.377103399607552, - "learning_rate": 1.9003162294295513e-06, - "loss": 1.0963, - "step": 267 - }, - { - "epoch": 0.4456453959675743, - "grad_norm": 2.5151408653095904, - "learning_rate": 1.8995522514329601e-06, - "loss": 1.104, - "step": 268 - }, - { - "epoch": 0.4473082519226772, - "grad_norm": 2.094881644352632, - "learning_rate": 1.8987855118066348e-06, - "loss": 1.2278, - "step": 269 - }, - { - "epoch": 0.4489711078777801, - "grad_norm": 2.2344009413987376, - "learning_rate": 1.8980160129044695e-06, - "loss": 1.1049, - "step": 270 - }, - { - "epoch": 0.45063396383288296, - "grad_norm": 2.297839277269806, - "learning_rate": 1.8972437570888307e-06, - "loss": 1.1538, - "step": 271 - }, - { - "epoch": 0.45229681978798586, - "grad_norm": 2.238645887183212, - "learning_rate": 1.8964687467305463e-06, - "loss": 1.1412, - "step": 272 - }, - { - "epoch": 0.45395967574308876, - "grad_norm": 2.145666305518761, - "learning_rate": 1.895690984208902e-06, - "loss": 1.0315, - "step": 273 - }, - { - "epoch": 0.45562253169819167, - "grad_norm": 2.310288742114033, - "learning_rate": 1.894910471911633e-06, - "loss": 1.0277, - "step": 274 - }, - { - "epoch": 0.4572853876532945, - "grad_norm": 2.155602139621284, - "learning_rate": 1.894127212234916e-06, - "loss": 1.0503, - "step": 275 - }, - { - "epoch": 0.4589482436083974, - "grad_norm": 2.6510720645982944, - "learning_rate": 1.8933412075833607e-06, - "loss": 1.0964, - "step": 276 - }, - { - "epoch": 0.4606110995635003, - "grad_norm": 2.077628159647317, - "learning_rate": 1.8925524603700062e-06, - "loss": 1.0037, - "step": 277 - }, - { - "epoch": 0.4622739555186032, - "grad_norm": 2.117391684386515, - "learning_rate": 1.8917609730163103e-06, - "loss": 1.0695, - "step": 278 - }, - { - "epoch": 0.4639368114737061, - "grad_norm": 2.372444846141058, - "learning_rate": 1.8909667479521425e-06, - "loss": 1.1472, - "step": 279 - }, - { - "epoch": 0.465599667428809, - "grad_norm": 2.4078311689148064, - "learning_rate": 1.8901697876157776e-06, - "loss": 1.2753, - "step": 280 - }, - { - "epoch": 0.4672625233839119, - "grad_norm": 3.21629533072984, - "learning_rate": 1.8893700944538881e-06, - "loss": 1.1071, - "step": 281 - }, - { - "epoch": 0.46892537933901474, - "grad_norm": 2.4295013417380287, - "learning_rate": 1.8885676709215353e-06, - "loss": 1.1355, - "step": 282 - }, - { - "epoch": 0.47058823529411764, - "grad_norm": 2.275602301735013, - "learning_rate": 1.8877625194821636e-06, - "loss": 1.2554, - "step": 283 - }, - { - "epoch": 0.47225109124922054, - "grad_norm": 2.3766077332166957, - "learning_rate": 1.8869546426075917e-06, - "loss": 1.0717, - "step": 284 - }, - { - "epoch": 0.47391394720432345, - "grad_norm": 2.132945677416556, - "learning_rate": 1.8861440427780058e-06, - "loss": 1.1736, - "step": 285 - }, - { - "epoch": 0.4755768031594263, - "grad_norm": 2.045707307029965, - "learning_rate": 1.8853307224819506e-06, - "loss": 0.9087, - "step": 286 - }, - { - "epoch": 0.4772396591145292, - "grad_norm": 2.163246456875578, - "learning_rate": 1.8845146842163238e-06, - "loss": 1.1869, - "step": 287 - }, - { - "epoch": 0.4789025150696321, - "grad_norm": 2.230799208427525, - "learning_rate": 1.8836959304863669e-06, - "loss": 1.0081, - "step": 288 - }, - { - "epoch": 0.48056537102473496, - "grad_norm": 2.430756467607878, - "learning_rate": 1.8828744638056573e-06, - "loss": 1.1908, - "step": 289 - }, - { - "epoch": 0.48222822697983786, - "grad_norm": 2.2702841588026117, - "learning_rate": 1.882050286696102e-06, - "loss": 1.1872, - "step": 290 - }, - { - "epoch": 0.48389108293494076, - "grad_norm": 2.214741995886969, - "learning_rate": 1.881223401687929e-06, - "loss": 1.101, - "step": 291 - }, - { - "epoch": 0.48555393889004367, - "grad_norm": 2.1908978532802, - "learning_rate": 1.8803938113196784e-06, - "loss": 1.0547, - "step": 292 - }, - { - "epoch": 0.4872167948451465, - "grad_norm": 2.1957825231056844, - "learning_rate": 1.8795615181381974e-06, - "loss": 1.0209, - "step": 293 - }, - { - "epoch": 0.4888796508002494, - "grad_norm": 2.4659476575942056, - "learning_rate": 1.8787265246986298e-06, - "loss": 1.0354, - "step": 294 - }, - { - "epoch": 0.4905425067553523, - "grad_norm": 2.499181314884686, - "learning_rate": 1.877888833564409e-06, - "loss": 1.1167, - "step": 295 - }, - { - "epoch": 0.49220536271045523, - "grad_norm": 2.297165687585788, - "learning_rate": 1.8770484473072517e-06, - "loss": 1.2812, - "step": 296 - }, - { - "epoch": 0.4938682186655581, - "grad_norm": 2.155148132032728, - "learning_rate": 1.8762053685071471e-06, - "loss": 0.9955, - "step": 297 - }, - { - "epoch": 0.495531074620661, - "grad_norm": 2.2105051332330183, - "learning_rate": 1.8753595997523513e-06, - "loss": 1.0817, - "step": 298 - }, - { - "epoch": 0.4971939305757639, - "grad_norm": 2.2489919353382937, - "learning_rate": 1.8745111436393785e-06, - "loss": 1.116, - "step": 299 - }, - { - "epoch": 0.49885678653086674, - "grad_norm": 2.245142666798669, - "learning_rate": 1.8736600027729933e-06, - "loss": 1.2594, - "step": 300 - }, - { - "epoch": 0.5005196424859697, - "grad_norm": 2.4738621609715263, - "learning_rate": 1.8728061797662016e-06, - "loss": 1.2273, - "step": 301 - }, - { - "epoch": 0.5021824984410725, - "grad_norm": 2.1040097093458745, - "learning_rate": 1.8719496772402447e-06, - "loss": 1.1395, - "step": 302 - }, - { - "epoch": 0.5021824984410725, - "eval_loss": 1.4084999561309814, - "eval_runtime": 24.7725, - "eval_samples_per_second": 0.444, - "eval_steps_per_second": 0.121, - "step": 302 - }, - { - "epoch": 0.5038453543961754, - "grad_norm": 2.4241733699272565, - "learning_rate": 1.8710904978245894e-06, - "loss": 1.0195, - "step": 303 - }, - { - "epoch": 0.5055082103512784, - "grad_norm": 2.0299543366481254, - "learning_rate": 1.8702286441569203e-06, - "loss": 0.9248, - "step": 304 - }, - { - "epoch": 0.5071710663063812, - "grad_norm": 2.0464859041290984, - "learning_rate": 1.8693641188831328e-06, - "loss": 1.0278, - "step": 305 - }, - { - "epoch": 0.508833922261484, - "grad_norm": 2.2925599147135043, - "learning_rate": 1.8684969246573232e-06, - "loss": 1.1409, - "step": 306 - }, - { - "epoch": 0.510496778216587, - "grad_norm": 2.4406740096357598, - "learning_rate": 1.8676270641417821e-06, - "loss": 1.139, - "step": 307 - }, - { - "epoch": 0.5121596341716899, - "grad_norm": 2.2502338928571692, - "learning_rate": 1.8667545400069856e-06, - "loss": 1.2704, - "step": 308 - }, - { - "epoch": 0.5138224901267928, - "grad_norm": 2.6218475196698274, - "learning_rate": 1.8658793549315868e-06, - "loss": 1.1125, - "step": 309 - }, - { - "epoch": 0.5154853460818957, - "grad_norm": 2.129495598032236, - "learning_rate": 1.8650015116024077e-06, - "loss": 1.0833, - "step": 310 - }, - { - "epoch": 0.5171482020369985, - "grad_norm": 4.815386485319442, - "learning_rate": 1.8641210127144328e-06, - "loss": 1.0958, - "step": 311 - }, - { - "epoch": 0.5188110579921015, - "grad_norm": 2.1912355764465383, - "learning_rate": 1.8632378609707967e-06, - "loss": 1.1466, - "step": 312 - }, - { - "epoch": 0.5204739139472043, - "grad_norm": 2.6871843307281504, - "learning_rate": 1.8623520590827797e-06, - "loss": 1.0443, - "step": 313 - }, - { - "epoch": 0.5221367699023072, - "grad_norm": 2.229171454549482, - "learning_rate": 1.8614636097697983e-06, - "loss": 1.2176, - "step": 314 - }, - { - "epoch": 0.5237996258574101, - "grad_norm": 2.3499709800000343, - "learning_rate": 1.8605725157593957e-06, - "loss": 1.0718, - "step": 315 - }, - { - "epoch": 0.525462481812513, - "grad_norm": 2.128381081935293, - "learning_rate": 1.8596787797872353e-06, - "loss": 1.0168, - "step": 316 - }, - { - "epoch": 0.5271253377676158, - "grad_norm": 2.4014872942261847, - "learning_rate": 1.85878240459709e-06, - "loss": 1.1539, - "step": 317 - }, - { - "epoch": 0.5287881937227188, - "grad_norm": 2.265996337760467, - "learning_rate": 1.857883392940837e-06, - "loss": 1.1284, - "step": 318 - }, - { - "epoch": 0.5304510496778216, - "grad_norm": 2.1606204777164826, - "learning_rate": 1.8569817475784457e-06, - "loss": 1.1039, - "step": 319 - }, - { - "epoch": 0.5321139056329246, - "grad_norm": 2.314221527250599, - "learning_rate": 1.8560774712779719e-06, - "loss": 1.1662, - "step": 320 - }, - { - "epoch": 0.5337767615880274, - "grad_norm": 2.2036586985620064, - "learning_rate": 1.8551705668155479e-06, - "loss": 1.0452, - "step": 321 - }, - { - "epoch": 0.5354396175431303, - "grad_norm": 2.749989537581693, - "learning_rate": 1.8542610369753753e-06, - "loss": 1.1471, - "step": 322 - }, - { - "epoch": 0.5371024734982333, - "grad_norm": 2.368917726575034, - "learning_rate": 1.8533488845497146e-06, - "loss": 1.3107, - "step": 323 - }, - { - "epoch": 0.5387653294533361, - "grad_norm": 2.1977699456063298, - "learning_rate": 1.8524341123388787e-06, - "loss": 0.9482, - "step": 324 - }, - { - "epoch": 0.540428185408439, - "grad_norm": 2.3335155676558625, - "learning_rate": 1.851516723151222e-06, - "loss": 1.0011, - "step": 325 - }, - { - "epoch": 0.5420910413635419, - "grad_norm": 2.2010547018844857, - "learning_rate": 1.850596719803134e-06, - "loss": 1.2211, - "step": 326 - }, - { - "epoch": 0.5437538973186448, - "grad_norm": 2.077916314192133, - "learning_rate": 1.8496741051190298e-06, - "loss": 1.1015, - "step": 327 - }, - { - "epoch": 0.5454167532737476, - "grad_norm": 2.164449806731655, - "learning_rate": 1.84874888193134e-06, - "loss": 0.9801, - "step": 328 - }, - { - "epoch": 0.5470796092288506, - "grad_norm": 2.098902846890324, - "learning_rate": 1.847821053080505e-06, - "loss": 0.9848, - "step": 329 - }, - { - "epoch": 0.5487424651839534, - "grad_norm": 2.3723967966499253, - "learning_rate": 1.8468906214149636e-06, - "loss": 1.1727, - "step": 330 - }, - { - "epoch": 0.5504053211390564, - "grad_norm": 2.28191518531219, - "learning_rate": 1.8459575897911453e-06, - "loss": 1.2235, - "step": 331 - }, - { - "epoch": 0.5520681770941592, - "grad_norm": 2.2820229639301406, - "learning_rate": 1.845021961073462e-06, - "loss": 1.2018, - "step": 332 - }, - { - "epoch": 0.5537310330492621, - "grad_norm": 2.138213918755252, - "learning_rate": 1.844083738134298e-06, - "loss": 1.2856, - "step": 333 - }, - { - "epoch": 0.555393889004365, - "grad_norm": 2.158628371398021, - "learning_rate": 1.8431429238540027e-06, - "loss": 1.0644, - "step": 334 - }, - { - "epoch": 0.5570567449594679, - "grad_norm": 2.3156623015320377, - "learning_rate": 1.84219952112088e-06, - "loss": 1.0024, - "step": 335 - }, - { - "epoch": 0.5587196009145707, - "grad_norm": 2.1704636280326923, - "learning_rate": 1.8412535328311812e-06, - "loss": 1.1603, - "step": 336 - }, - { - "epoch": 0.5603824568696737, - "grad_norm": 2.324581658968828, - "learning_rate": 1.8403049618890948e-06, - "loss": 1.2107, - "step": 337 - }, - { - "epoch": 0.5620453128247765, - "grad_norm": 2.1819124113613504, - "learning_rate": 1.8393538112067376e-06, - "loss": 1.0166, - "step": 338 - }, - { - "epoch": 0.5637081687798794, - "grad_norm": 2.14128534614559, - "learning_rate": 1.8384000837041476e-06, - "loss": 1.0348, - "step": 339 - }, - { - "epoch": 0.5653710247349824, - "grad_norm": 2.2165793831097034, - "learning_rate": 1.8374437823092722e-06, - "loss": 1.1544, - "step": 340 - }, - { - "epoch": 0.5670338806900852, - "grad_norm": 2.0867661916904456, - "learning_rate": 1.8364849099579618e-06, - "loss": 1.074, - "step": 341 - }, - { - "epoch": 0.5686967366451882, - "grad_norm": 2.278047999899843, - "learning_rate": 1.8355234695939585e-06, - "loss": 1.1037, - "step": 342 - }, - { - "epoch": 0.570359592600291, - "grad_norm": 2.15233238553855, - "learning_rate": 1.8345594641688894e-06, - "loss": 1.0763, - "step": 343 - }, - { - "epoch": 0.5720224485553939, - "grad_norm": 2.1861191898315067, - "learning_rate": 1.8335928966422553e-06, - "loss": 1.273, - "step": 344 - }, - { - "epoch": 0.5736853045104968, - "grad_norm": 2.191986681142867, - "learning_rate": 1.8326237699814238e-06, - "loss": 1.1571, - "step": 345 - }, - { - "epoch": 0.5753481604655997, - "grad_norm": 2.2910127446336737, - "learning_rate": 1.8316520871616178e-06, - "loss": 1.0243, - "step": 346 - }, - { - "epoch": 0.5770110164207025, - "grad_norm": 2.088235032433896, - "learning_rate": 1.8306778511659085e-06, - "loss": 1.0289, - "step": 347 - }, - { - "epoch": 0.5786738723758055, - "grad_norm": 2.213333845589627, - "learning_rate": 1.829701064985205e-06, - "loss": 1.1229, - "step": 348 - }, - { - "epoch": 0.5803367283309083, - "grad_norm": 2.3961880269672626, - "learning_rate": 1.8287217316182457e-06, - "loss": 0.9937, - "step": 349 - }, - { - "epoch": 0.5819995842860112, - "grad_norm": 2.052444834442602, - "learning_rate": 1.8277398540715887e-06, - "loss": 1.1965, - "step": 350 - }, - { - "epoch": 0.5836624402411141, - "grad_norm": 2.410438103487435, - "learning_rate": 1.8267554353596024e-06, - "loss": 1.1488, - "step": 351 - }, - { - "epoch": 0.585325296196217, - "grad_norm": 2.3043831544316835, - "learning_rate": 1.8257684785044576e-06, - "loss": 0.9968, - "step": 352 - }, - { - "epoch": 0.5869881521513199, - "grad_norm": 2.116368023415211, - "learning_rate": 1.8247789865361163e-06, - "loss": 1.0944, - "step": 353 - }, - { - "epoch": 0.5886510081064228, - "grad_norm": 2.0486087552163146, - "learning_rate": 1.8237869624923234e-06, - "loss": 1.127, - "step": 354 - }, - { - "epoch": 0.5903138640615256, - "grad_norm": 2.2869039816178036, - "learning_rate": 1.8227924094185978e-06, - "loss": 1.0497, - "step": 355 - }, - { - "epoch": 0.5919767200166286, - "grad_norm": 2.1915186885351448, - "learning_rate": 1.821795330368222e-06, - "loss": 1.1584, - "step": 356 - }, - { - "epoch": 0.5936395759717314, - "grad_norm": 2.4441273557487566, - "learning_rate": 1.8207957284022337e-06, - "loss": 1.1206, - "step": 357 - }, - { - "epoch": 0.5953024319268343, - "grad_norm": 2.3161936585249334, - "learning_rate": 1.8197936065894155e-06, - "loss": 1.119, - "step": 358 - }, - { - "epoch": 0.5969652878819373, - "grad_norm": 2.303219346593548, - "learning_rate": 1.8187889680062863e-06, - "loss": 1.2847, - "step": 359 - }, - { - "epoch": 0.5986281438370401, - "grad_norm": 2.1218629296829152, - "learning_rate": 1.8177818157370912e-06, - "loss": 1.0495, - "step": 360 - }, - { - "epoch": 0.600290999792143, - "grad_norm": 2.055562774852349, - "learning_rate": 1.816772152873793e-06, - "loss": 1.1212, - "step": 361 - }, - { - "epoch": 0.6019538557472459, - "grad_norm": 2.1036164422163695, - "learning_rate": 1.8157599825160607e-06, - "loss": 0.9293, - "step": 362 - }, - { - "epoch": 0.6036167117023488, - "grad_norm": 2.2429881312173596, - "learning_rate": 1.8147453077712634e-06, - "loss": 1.134, - "step": 363 - }, - { - "epoch": 0.6052795676574517, - "grad_norm": 2.147826876347206, - "learning_rate": 1.813728131754456e-06, - "loss": 1.0184, - "step": 364 - }, - { - "epoch": 0.6069424236125546, - "grad_norm": 2.221807143035665, - "learning_rate": 1.8127084575883748e-06, - "loss": 1.1232, - "step": 365 - }, - { - "epoch": 0.6086052795676574, - "grad_norm": 2.608531400881515, - "learning_rate": 1.811686288403424e-06, - "loss": 1.166, - "step": 366 - }, - { - "epoch": 0.6102681355227604, - "grad_norm": 2.3488837184353937, - "learning_rate": 1.8106616273376681e-06, - "loss": 1.0413, - "step": 367 - }, - { - "epoch": 0.6119309914778632, - "grad_norm": 2.2777707823295668, - "learning_rate": 1.8096344775368211e-06, - "loss": 1.0941, - "step": 368 - }, - { - "epoch": 0.6135938474329661, - "grad_norm": 2.2605130570731253, - "learning_rate": 1.808604842154238e-06, - "loss": 1.0537, - "step": 369 - }, - { - "epoch": 0.615256703388069, - "grad_norm": 2.292940209107732, - "learning_rate": 1.807572724350905e-06, - "loss": 1.0708, - "step": 370 - }, - { - "epoch": 0.6169195593431719, - "grad_norm": 2.10745264157216, - "learning_rate": 1.8065381272954276e-06, - "loss": 0.9477, - "step": 371 - }, - { - "epoch": 0.6185824152982747, - "grad_norm": 2.1624162140056176, - "learning_rate": 1.8055010541640243e-06, - "loss": 1.0009, - "step": 372 - }, - { - "epoch": 0.6202452712533777, - "grad_norm": 2.4110174672424347, - "learning_rate": 1.8044615081405151e-06, - "loss": 1.082, - "step": 373 - }, - { - "epoch": 0.6219081272084805, - "grad_norm": 2.3314352780690166, - "learning_rate": 1.8034194924163103e-06, - "loss": 1.2243, - "step": 374 - }, - { - "epoch": 0.6235709831635835, - "grad_norm": 2.2871677222090137, - "learning_rate": 1.802375010190404e-06, - "loss": 0.9866, - "step": 375 - }, - { - "epoch": 0.6252338391186864, - "grad_norm": 2.3308466745130465, - "learning_rate": 1.8013280646693612e-06, - "loss": 1.0981, - "step": 376 - }, - { - "epoch": 0.6268966950737892, - "grad_norm": 2.211137881920983, - "learning_rate": 1.8002786590673096e-06, - "loss": 1.0953, - "step": 377 - }, - { - "epoch": 0.6285595510288922, - "grad_norm": 2.2653592375727727, - "learning_rate": 1.7992267966059298e-06, - "loss": 1.0843, - "step": 378 - }, - { - "epoch": 0.630222406983995, - "grad_norm": 2.0429297450824024, - "learning_rate": 1.7981724805144443e-06, - "loss": 1.0853, - "step": 379 - }, - { - "epoch": 0.6318852629390979, - "grad_norm": 2.2106979108018794, - "learning_rate": 1.7971157140296088e-06, - "loss": 1.0946, - "step": 380 - }, - { - "epoch": 0.6335481188942008, - "grad_norm": 2.2495780703298025, - "learning_rate": 1.7960565003957016e-06, - "loss": 1.1758, - "step": 381 - }, - { - "epoch": 0.6352109748493037, - "grad_norm": 2.442736100176031, - "learning_rate": 1.7949948428645133e-06, - "loss": 1.0848, - "step": 382 - }, - { - "epoch": 0.6368738308044065, - "grad_norm": 2.1142990741185286, - "learning_rate": 1.7939307446953378e-06, - "loss": 1.0653, - "step": 383 - }, - { - "epoch": 0.6385366867595095, - "grad_norm": 2.0910930966792782, - "learning_rate": 1.7928642091549612e-06, - "loss": 0.9725, - "step": 384 - }, - { - "epoch": 0.6401995427146123, - "grad_norm": 2.278583439400143, - "learning_rate": 1.7917952395176535e-06, - "loss": 1.0259, - "step": 385 - }, - { - "epoch": 0.6418623986697153, - "grad_norm": 2.545286223573369, - "learning_rate": 1.790723839065156e-06, - "loss": 1.1821, - "step": 386 - }, - { - "epoch": 0.6435252546248181, - "grad_norm": 2.261488639334733, - "learning_rate": 1.7896500110866737e-06, - "loss": 0.939, - "step": 387 - }, - { - "epoch": 0.645188110579921, - "grad_norm": 2.7678067419857597, - "learning_rate": 1.7885737588788632e-06, - "loss": 1.0408, - "step": 388 - }, - { - "epoch": 0.6468509665350239, - "grad_norm": 2.11635682407254, - "learning_rate": 1.787495085745824e-06, - "loss": 1.1696, - "step": 389 - }, - { - "epoch": 0.6485138224901268, - "grad_norm": 2.3192534757366703, - "learning_rate": 1.7864139949990882e-06, - "loss": 0.9589, - "step": 390 - }, - { - "epoch": 0.6501766784452296, - "grad_norm": 2.150754996425321, - "learning_rate": 1.7853304899576091e-06, - "loss": 1.1067, - "step": 391 - }, - { - "epoch": 0.6518395344003326, - "grad_norm": 2.2383513192031574, - "learning_rate": 1.784244573947753e-06, - "loss": 1.3417, - "step": 392 - }, - { - "epoch": 0.6535023903554354, - "grad_norm": 2.290710629399966, - "learning_rate": 1.7831562503032865e-06, - "loss": 1.1442, - "step": 393 - }, - { - "epoch": 0.6551652463105383, - "grad_norm": 3.3410043284621143, - "learning_rate": 1.7820655223653689e-06, - "loss": 1.2796, - "step": 394 - }, - { - "epoch": 0.6568281022656413, - "grad_norm": 2.1116250875932727, - "learning_rate": 1.7809723934825402e-06, - "loss": 1.0086, - "step": 395 - }, - { - "epoch": 0.6584909582207441, - "grad_norm": 2.242523763487976, - "learning_rate": 1.7798768670107113e-06, - "loss": 1.0907, - "step": 396 - }, - { - "epoch": 0.6601538141758471, - "grad_norm": 2.151880661955345, - "learning_rate": 1.7787789463131535e-06, - "loss": 0.9459, - "step": 397 - }, - { - "epoch": 0.6618166701309499, - "grad_norm": 2.1313600148722047, - "learning_rate": 1.777678634760489e-06, - "loss": 1.1144, - "step": 398 - }, - { - "epoch": 0.6634795260860528, - "grad_norm": 2.445945952206475, - "learning_rate": 1.7765759357306793e-06, - "loss": 1.1141, - "step": 399 - }, - { - "epoch": 0.6651423820411557, - "grad_norm": 2.1049458345635417, - "learning_rate": 1.7754708526090155e-06, - "loss": 0.9842, - "step": 400 - }, - { - "epoch": 0.6668052379962586, - "grad_norm": 2.4326727008731135, - "learning_rate": 1.7743633887881088e-06, - "loss": 1.1459, - "step": 401 - }, - { - "epoch": 0.6684680939513614, - "grad_norm": 2.2415440353250022, - "learning_rate": 1.7732535476678776e-06, - "loss": 1.0153, - "step": 402 - }, - { - "epoch": 0.6701309499064644, - "grad_norm": 2.6676880307885034, - "learning_rate": 1.77214133265554e-06, - "loss": 1.216, - "step": 403 - }, - { - "epoch": 0.6717938058615672, - "grad_norm": 2.259873210911179, - "learning_rate": 1.7710267471656013e-06, - "loss": 1.1607, - "step": 404 - }, - { - "epoch": 0.6734566618166701, - "grad_norm": 2.1060724600194627, - "learning_rate": 1.7699097946198443e-06, - "loss": 1.0631, - "step": 405 - }, - { - "epoch": 0.675119517771773, - "grad_norm": 2.1821450546259067, - "learning_rate": 1.7687904784473186e-06, - "loss": 1.0213, - "step": 406 - }, - { - "epoch": 0.6767823737268759, - "grad_norm": 2.043314834472435, - "learning_rate": 1.7676688020843305e-06, - "loss": 1.1173, - "step": 407 - }, - { - "epoch": 0.6784452296819788, - "grad_norm": 2.342119646043276, - "learning_rate": 1.7665447689744317e-06, - "loss": 1.0894, - "step": 408 - }, - { - "epoch": 0.6801080856370817, - "grad_norm": 2.0782278008731687, - "learning_rate": 1.7654183825684091e-06, - "loss": 1.1245, - "step": 409 - }, - { - "epoch": 0.6817709415921845, - "grad_norm": 2.0556118504588388, - "learning_rate": 1.7642896463242744e-06, - "loss": 1.0274, - "step": 410 - }, - { - "epoch": 0.6834337975472875, - "grad_norm": 2.4517557799237957, - "learning_rate": 1.7631585637072535e-06, - "loss": 1.1762, - "step": 411 - }, - { - "epoch": 0.6850966535023904, - "grad_norm": 2.2629874321423635, - "learning_rate": 1.7620251381897751e-06, - "loss": 0.9806, - "step": 412 - }, - { - "epoch": 0.6867595094574932, - "grad_norm": 2.33240157213656, - "learning_rate": 1.7608893732514615e-06, - "loss": 1.156, - "step": 413 - }, - { - "epoch": 0.6884223654125962, - "grad_norm": 2.179297842815191, - "learning_rate": 1.7597512723791162e-06, - "loss": 1.0117, - "step": 414 - }, - { - "epoch": 0.690085221367699, - "grad_norm": 2.1949726299406045, - "learning_rate": 1.7586108390667142e-06, - "loss": 1.0476, - "step": 415 - }, - { - "epoch": 0.6917480773228019, - "grad_norm": 2.3336572022760884, - "learning_rate": 1.7574680768153915e-06, - "loss": 1.1267, - "step": 416 - }, - { - "epoch": 0.6934109332779048, - "grad_norm": 2.3983506966023542, - "learning_rate": 1.7563229891334336e-06, - "loss": 1.034, - "step": 417 - }, - { - "epoch": 0.6950737892330077, - "grad_norm": 2.0848820780573476, - "learning_rate": 1.7551755795362654e-06, - "loss": 0.9663, - "step": 418 - }, - { - "epoch": 0.6967366451881106, - "grad_norm": 2.2371795871187063, - "learning_rate": 1.7540258515464395e-06, - "loss": 1.1261, - "step": 419 - }, - { - "epoch": 0.6983995011432135, - "grad_norm": 2.1100352921599437, - "learning_rate": 1.7528738086936269e-06, - "loss": 1.0711, - "step": 420 - }, - { - "epoch": 0.7000623570983163, - "grad_norm": 2.147207122561456, - "learning_rate": 1.7517194545146036e-06, - "loss": 1.0874, - "step": 421 - }, - { - "epoch": 0.7017252130534193, - "grad_norm": 2.1810803049772645, - "learning_rate": 1.750562792553244e-06, - "loss": 1.0132, - "step": 422 - }, - { - "epoch": 0.7033880690085221, - "grad_norm": 2.155938991100877, - "learning_rate": 1.7494038263605049e-06, - "loss": 1.121, - "step": 423 - }, - { - "epoch": 0.705050924963625, - "grad_norm": 2.368347963131549, - "learning_rate": 1.7482425594944182e-06, - "loss": 1.1172, - "step": 424 - }, - { - "epoch": 0.7067137809187279, - "grad_norm": 2.2271099712267013, - "learning_rate": 1.7470789955200786e-06, - "loss": 1.0263, - "step": 425 - }, - { - "epoch": 0.7083766368738308, - "grad_norm": 2.229936898551502, - "learning_rate": 1.7459131380096336e-06, - "loss": 1.0274, - "step": 426 - }, - { - "epoch": 0.7100394928289336, - "grad_norm": 2.158588860368013, - "learning_rate": 1.744744990542271e-06, - "loss": 1.1544, - "step": 427 - }, - { - "epoch": 0.7117023487840366, - "grad_norm": 2.161831442303464, - "learning_rate": 1.7435745567042094e-06, - "loss": 1.0847, - "step": 428 - }, - { - "epoch": 0.7133652047391394, - "grad_norm": 2.169173816530374, - "learning_rate": 1.7424018400886858e-06, - "loss": 1.042, - "step": 429 - }, - { - "epoch": 0.7150280606942424, - "grad_norm": 2.2018091926987964, - "learning_rate": 1.7412268442959465e-06, - "loss": 1.0173, - "step": 430 - }, - { - "epoch": 0.7166909166493453, - "grad_norm": 2.2558519988159045, - "learning_rate": 1.7400495729332337e-06, - "loss": 1.2228, - "step": 431 - }, - { - "epoch": 0.7183537726044481, - "grad_norm": 2.286849860794016, - "learning_rate": 1.7388700296147763e-06, - "loss": 1.1118, - "step": 432 - }, - { - "epoch": 0.7200166285595511, - "grad_norm": 2.376710954298681, - "learning_rate": 1.737688217961778e-06, - "loss": 1.221, - "step": 433 - }, - { - "epoch": 0.7216794845146539, - "grad_norm": 2.165270182256038, - "learning_rate": 1.7365041416024063e-06, - "loss": 1.0928, - "step": 434 - }, - { - "epoch": 0.7233423404697568, - "grad_norm": 2.039272996802367, - "learning_rate": 1.7353178041717814e-06, - "loss": 1.0993, - "step": 435 - }, - { - "epoch": 0.7250051964248597, - "grad_norm": 3.179799411220361, - "learning_rate": 1.7341292093119648e-06, - "loss": 1.1437, - "step": 436 - }, - { - "epoch": 0.7266680523799626, - "grad_norm": 2.438107959779604, - "learning_rate": 1.732938360671948e-06, - "loss": 1.0467, - "step": 437 - }, - { - "epoch": 0.7283309083350655, - "grad_norm": 2.227901031363515, - "learning_rate": 1.7317452619076426e-06, - "loss": 1.2184, - "step": 438 - }, - { - "epoch": 0.7299937642901684, - "grad_norm": 2.484925764202615, - "learning_rate": 1.7305499166818679e-06, - "loss": 1.0869, - "step": 439 - }, - { - "epoch": 0.7316566202452712, - "grad_norm": 2.328525821533016, - "learning_rate": 1.7293523286643383e-06, - "loss": 1.1372, - "step": 440 - }, - { - "epoch": 0.7333194762003742, - "grad_norm": 2.3975391729646014, - "learning_rate": 1.7281525015316559e-06, - "loss": 1.164, - "step": 441 - }, - { - "epoch": 0.734982332155477, - "grad_norm": 2.045522380294782, - "learning_rate": 1.726950438967295e-06, - "loss": 0.9614, - "step": 442 - }, - { - "epoch": 0.7366451881105799, - "grad_norm": 2.233756657460333, - "learning_rate": 1.7257461446615939e-06, - "loss": 1.2181, - "step": 443 - }, - { - "epoch": 0.7383080440656828, - "grad_norm": 2.2137748173388694, - "learning_rate": 1.724539622311742e-06, - "loss": 1.143, - "step": 444 - }, - { - "epoch": 0.7399709000207857, - "grad_norm": 2.384585810876256, - "learning_rate": 1.723330875621768e-06, - "loss": 1.0785, - "step": 445 - }, - { - "epoch": 0.7416337559758885, - "grad_norm": 2.3185815551148585, - "learning_rate": 1.7221199083025305e-06, - "loss": 1.0606, - "step": 446 - }, - { - "epoch": 0.7432966119309915, - "grad_norm": 2.1918576903211053, - "learning_rate": 1.7209067240717055e-06, - "loss": 1.0824, - "step": 447 - }, - { - "epoch": 0.7449594678860944, - "grad_norm": 2.353863396233425, - "learning_rate": 1.7196913266537736e-06, - "loss": 1.1912, - "step": 448 - }, - { - "epoch": 0.7466223238411973, - "grad_norm": 2.0380687806247253, - "learning_rate": 1.7184737197800113e-06, - "loss": 1.0981, - "step": 449 - }, - { - "epoch": 0.7482851797963002, - "grad_norm": 2.5650954285686245, - "learning_rate": 1.717253907188477e-06, - "loss": 1.0268, - "step": 450 - }, - { - "epoch": 0.749948035751403, - "grad_norm": 2.3190693587689464, - "learning_rate": 1.7160318926240014e-06, - "loss": 1.1847, - "step": 451 - }, - { - "epoch": 0.751610891706506, - "grad_norm": 2.1059332821710783, - "learning_rate": 1.7148076798381754e-06, - "loss": 1.0832, - "step": 452 - }, - { - "epoch": 0.7532737476616088, - "grad_norm": 3.056086058439594, - "learning_rate": 1.713581272589338e-06, - "loss": 1.1343, - "step": 453 - }, - { - "epoch": 0.7532737476616088, - "eval_loss": 1.3934024572372437, - "eval_runtime": 24.6818, - "eval_samples_per_second": 0.446, - "eval_steps_per_second": 0.122, - "step": 453 - }, - { - "epoch": 0.7549366036167117, - "grad_norm": 2.4488905684680455, - "learning_rate": 1.7123526746425649e-06, - "loss": 1.0994, - "step": 454 - }, - { - "epoch": 0.7565994595718146, - "grad_norm": 2.6403911690075215, - "learning_rate": 1.7111218897696585e-06, - "loss": 1.0764, - "step": 455 - }, - { - "epoch": 0.7582623155269175, - "grad_norm": 2.1327294883754577, - "learning_rate": 1.7098889217491336e-06, - "loss": 1.1461, - "step": 456 - }, - { - "epoch": 0.7599251714820203, - "grad_norm": 2.0936769499266, - "learning_rate": 1.7086537743662084e-06, - "loss": 1.1827, - "step": 457 - }, - { - "epoch": 0.7615880274371233, - "grad_norm": 2.0368197792938814, - "learning_rate": 1.707416451412791e-06, - "loss": 1.0709, - "step": 458 - }, - { - "epoch": 0.7632508833922261, - "grad_norm": 2.1159264469009154, - "learning_rate": 1.7061769566874688e-06, - "loss": 1.1597, - "step": 459 - }, - { - "epoch": 0.7649137393473291, - "grad_norm": 2.2371503881264134, - "learning_rate": 1.7049352939954966e-06, - "loss": 1.1034, - "step": 460 - }, - { - "epoch": 0.7665765953024319, - "grad_norm": 2.167369770231308, - "learning_rate": 1.7036914671487849e-06, - "loss": 1.1398, - "step": 461 - }, - { - "epoch": 0.7682394512575348, - "grad_norm": 2.682964801749785, - "learning_rate": 1.7024454799658883e-06, - "loss": 1.1015, - "step": 462 - }, - { - "epoch": 0.7699023072126377, - "grad_norm": 2.3132284135399903, - "learning_rate": 1.7011973362719929e-06, - "loss": 0.9757, - "step": 463 - }, - { - "epoch": 0.7715651631677406, - "grad_norm": 2.226364690832632, - "learning_rate": 1.6999470398989066e-06, - "loss": 1.0315, - "step": 464 - }, - { - "epoch": 0.7732280191228434, - "grad_norm": 2.333679960095519, - "learning_rate": 1.6986945946850445e-06, - "loss": 1.1647, - "step": 465 - }, - { - "epoch": 0.7748908750779464, - "grad_norm": 2.1371349082975883, - "learning_rate": 1.6974400044754198e-06, - "loss": 1.0292, - "step": 466 - }, - { - "epoch": 0.7765537310330493, - "grad_norm": 2.044267477405619, - "learning_rate": 1.6961832731216305e-06, - "loss": 1.0265, - "step": 467 - }, - { - "epoch": 0.7782165869881521, - "grad_norm": 2.4049976847050987, - "learning_rate": 1.694924404481848e-06, - "loss": 1.0263, - "step": 468 - }, - { - "epoch": 0.7798794429432551, - "grad_norm": 2.1513258273801097, - "learning_rate": 1.6936634024208045e-06, - "loss": 1.0255, - "step": 469 - }, - { - "epoch": 0.7815422988983579, - "grad_norm": 2.267086285542123, - "learning_rate": 1.692400270809783e-06, - "loss": 1.1037, - "step": 470 - }, - { - "epoch": 0.7832051548534609, - "grad_norm": 2.207206827718745, - "learning_rate": 1.6911350135266034e-06, - "loss": 1.2229, - "step": 471 - }, - { - "epoch": 0.7848680108085637, - "grad_norm": 2.3370445622010267, - "learning_rate": 1.6898676344556116e-06, - "loss": 1.0456, - "step": 472 - }, - { - "epoch": 0.7865308667636666, - "grad_norm": 2.694579739504598, - "learning_rate": 1.6885981374876675e-06, - "loss": 1.1304, - "step": 473 - }, - { - "epoch": 0.7881937227187695, - "grad_norm": 2.4654672927693024, - "learning_rate": 1.6873265265201329e-06, - "loss": 1.1218, - "step": 474 - }, - { - "epoch": 0.7898565786738724, - "grad_norm": 2.4497013813355504, - "learning_rate": 1.6860528054568596e-06, - "loss": 1.188, - "step": 475 - }, - { - "epoch": 0.7915194346289752, - "grad_norm": 2.2437903296852517, - "learning_rate": 1.684776978208177e-06, - "loss": 1.0714, - "step": 476 - }, - { - "epoch": 0.7931822905840782, - "grad_norm": 2.430879750782421, - "learning_rate": 1.6834990486908816e-06, - "loss": 1.1451, - "step": 477 - }, - { - "epoch": 0.794845146539181, - "grad_norm": 2.1676565673117265, - "learning_rate": 1.6822190208282226e-06, - "loss": 1.0458, - "step": 478 - }, - { - "epoch": 0.7965080024942839, - "grad_norm": 2.179125948082596, - "learning_rate": 1.6809368985498918e-06, - "loss": 1.125, - "step": 479 - }, - { - "epoch": 0.7981708584493868, - "grad_norm": 2.277708398927927, - "learning_rate": 1.679652685792011e-06, - "loss": 1.0762, - "step": 480 - }, - { - "epoch": 0.7998337144044897, - "grad_norm": 2.2097528761591834, - "learning_rate": 1.6783663864971191e-06, - "loss": 1.0587, - "step": 481 - }, - { - "epoch": 0.8014965703595927, - "grad_norm": 1.9951583919167484, - "learning_rate": 1.6770780046141614e-06, - "loss": 1.0065, - "step": 482 - }, - { - "epoch": 0.8031594263146955, - "grad_norm": 2.1163449529859815, - "learning_rate": 1.6757875440984765e-06, - "loss": 0.9654, - "step": 483 - }, - { - "epoch": 0.8048222822697984, - "grad_norm": 2.169660817074707, - "learning_rate": 1.6744950089117845e-06, - "loss": 1.109, - "step": 484 - }, - { - "epoch": 0.8064851382249013, - "grad_norm": 2.5007409181660596, - "learning_rate": 1.6732004030221743e-06, - "loss": 1.0073, - "step": 485 - }, - { - "epoch": 0.8081479941800042, - "grad_norm": 2.382682682731147, - "learning_rate": 1.6719037304040921e-06, - "loss": 1.117, - "step": 486 - }, - { - "epoch": 0.809810850135107, - "grad_norm": 2.1404776200804827, - "learning_rate": 1.6706049950383299e-06, - "loss": 1.1485, - "step": 487 - }, - { - "epoch": 0.81147370609021, - "grad_norm": 1.9674458939644788, - "learning_rate": 1.6693042009120104e-06, - "loss": 0.9832, - "step": 488 - }, - { - "epoch": 0.8131365620453128, - "grad_norm": 2.428436601155877, - "learning_rate": 1.6680013520185786e-06, - "loss": 1.1419, - "step": 489 - }, - { - "epoch": 0.8147994180004157, - "grad_norm": 2.252054044332263, - "learning_rate": 1.6666964523577866e-06, - "loss": 1.1008, - "step": 490 - }, - { - "epoch": 0.8164622739555186, - "grad_norm": 2.3185844238230167, - "learning_rate": 1.6653895059356827e-06, - "loss": 1.0961, - "step": 491 - }, - { - "epoch": 0.8181251299106215, - "grad_norm": 2.245128772330143, - "learning_rate": 1.6640805167645984e-06, - "loss": 1.0219, - "step": 492 - }, - { - "epoch": 0.8197879858657244, - "grad_norm": 2.2252839355087404, - "learning_rate": 1.6627694888631374e-06, - "loss": 1.1808, - "step": 493 - }, - { - "epoch": 0.8214508418208273, - "grad_norm": 2.4802130292227487, - "learning_rate": 1.6614564262561608e-06, - "loss": 1.0952, - "step": 494 - }, - { - "epoch": 0.8231136977759301, - "grad_norm": 2.0600513070759594, - "learning_rate": 1.6601413329747778e-06, - "loss": 1.0461, - "step": 495 - }, - { - "epoch": 0.8247765537310331, - "grad_norm": 2.129782322403382, - "learning_rate": 1.6588242130563308e-06, - "loss": 1.0565, - "step": 496 - }, - { - "epoch": 0.8264394096861359, - "grad_norm": 2.24596861189698, - "learning_rate": 1.657505070544384e-06, - "loss": 1.2183, - "step": 497 - }, - { - "epoch": 0.8281022656412388, - "grad_norm": 2.2144169319672686, - "learning_rate": 1.6561839094887123e-06, - "loss": 1.056, - "step": 498 - }, - { - "epoch": 0.8297651215963417, - "grad_norm": 2.1326715488699346, - "learning_rate": 1.6548607339452852e-06, - "loss": 1.081, - "step": 499 - }, - { - "epoch": 0.8314279775514446, - "grad_norm": 2.400366586745486, - "learning_rate": 1.6535355479762584e-06, - "loss": 1.0388, - "step": 500 - }, - { - "epoch": 0.8330908335065474, - "grad_norm": 2.101276111938611, - "learning_rate": 1.6522083556499595e-06, - "loss": 1.13, - "step": 501 - }, - { - "epoch": 0.8347536894616504, - "grad_norm": 2.1474700452589017, - "learning_rate": 1.6508791610408751e-06, - "loss": 1.0743, - "step": 502 - }, - { - "epoch": 0.8364165454167533, - "grad_norm": 2.449496114081372, - "learning_rate": 1.6495479682296393e-06, - "loss": 1.1763, - "step": 503 - }, - { - "epoch": 0.8380794013718562, - "grad_norm": 2.2397418249566705, - "learning_rate": 1.6482147813030202e-06, - "loss": 1.0881, - "step": 504 - }, - { - "epoch": 0.8397422573269591, - "grad_norm": 2.38564204606019, - "learning_rate": 1.646879604353908e-06, - "loss": 0.9649, - "step": 505 - }, - { - "epoch": 0.8414051132820619, - "grad_norm": 2.51940590317821, - "learning_rate": 1.6455424414813024e-06, - "loss": 1.2114, - "step": 506 - }, - { - "epoch": 0.8430679692371649, - "grad_norm": 2.3181783821555175, - "learning_rate": 1.6442032967903e-06, - "loss": 1.1246, - "step": 507 - }, - { - "epoch": 0.8447308251922677, - "grad_norm": 2.159422324003234, - "learning_rate": 1.6428621743920812e-06, - "loss": 1.0351, - "step": 508 - }, - { - "epoch": 0.8463936811473706, - "grad_norm": 2.411802254120254, - "learning_rate": 1.6415190784038982e-06, - "loss": 1.0776, - "step": 509 - }, - { - "epoch": 0.8480565371024735, - "grad_norm": 2.0734882608633085, - "learning_rate": 1.6401740129490622e-06, - "loss": 1.1148, - "step": 510 - }, - { - "epoch": 0.8497193930575764, - "grad_norm": 2.318932880803621, - "learning_rate": 1.638826982156931e-06, - "loss": 1.0853, - "step": 511 - }, - { - "epoch": 0.8513822490126792, - "grad_norm": 2.1981900335907247, - "learning_rate": 1.637477990162895e-06, - "loss": 1.0151, - "step": 512 - }, - { - "epoch": 0.8530451049677822, - "grad_norm": 2.121049624802221, - "learning_rate": 1.6361270411083665e-06, - "loss": 1.0518, - "step": 513 - }, - { - "epoch": 0.854707960922885, - "grad_norm": 2.191890004201356, - "learning_rate": 1.6347741391407653e-06, - "loss": 1.0426, - "step": 514 - }, - { - "epoch": 0.856370816877988, - "grad_norm": 2.194218873534906, - "learning_rate": 1.6334192884135073e-06, - "loss": 1.1111, - "step": 515 - }, - { - "epoch": 0.8580336728330908, - "grad_norm": 2.0367103549252983, - "learning_rate": 1.6320624930859904e-06, - "loss": 1.0521, - "step": 516 - }, - { - "epoch": 0.8596965287881937, - "grad_norm": 2.5679919755305973, - "learning_rate": 1.630703757323583e-06, - "loss": 1.0547, - "step": 517 - }, - { - "epoch": 0.8613593847432967, - "grad_norm": 2.2355473603338676, - "learning_rate": 1.62934308529761e-06, - "loss": 1.0582, - "step": 518 - }, - { - "epoch": 0.8630222406983995, - "grad_norm": 2.04568580636059, - "learning_rate": 1.6279804811853418e-06, - "loss": 0.9508, - "step": 519 - }, - { - "epoch": 0.8646850966535023, - "grad_norm": 2.137101454934, - "learning_rate": 1.6266159491699787e-06, - "loss": 1.05, - "step": 520 - }, - { - "epoch": 0.8663479526086053, - "grad_norm": 2.3456035912178126, - "learning_rate": 1.6252494934406413e-06, - "loss": 1.0513, - "step": 521 - }, - { - "epoch": 0.8680108085637082, - "grad_norm": 2.3170057893277645, - "learning_rate": 1.6238811181923548e-06, - "loss": 1.0833, - "step": 522 - }, - { - "epoch": 0.869673664518811, - "grad_norm": 2.03138518708311, - "learning_rate": 1.6225108276260384e-06, - "loss": 1.0733, - "step": 523 - }, - { - "epoch": 0.871336520473914, - "grad_norm": 2.1936879157391225, - "learning_rate": 1.6211386259484898e-06, - "loss": 1.1578, - "step": 524 - }, - { - "epoch": 0.8729993764290168, - "grad_norm": 2.162845347014475, - "learning_rate": 1.6197645173723755e-06, - "loss": 1.0716, - "step": 525 - }, - { - "epoch": 0.8746622323841198, - "grad_norm": 2.2797022460412335, - "learning_rate": 1.6183885061162149e-06, - "loss": 1.1435, - "step": 526 - }, - { - "epoch": 0.8763250883392226, - "grad_norm": 2.8402768433436965, - "learning_rate": 1.6170105964043693e-06, - "loss": 1.0986, - "step": 527 - }, - { - "epoch": 0.8779879442943255, - "grad_norm": 2.2040976685633726, - "learning_rate": 1.6156307924670287e-06, - "loss": 1.1769, - "step": 528 - }, - { - "epoch": 0.8796508002494284, - "grad_norm": 2.2121586340352724, - "learning_rate": 1.614249098540197e-06, - "loss": 1.0384, - "step": 529 - }, - { - "epoch": 0.8813136562045313, - "grad_norm": 2.1737348089144772, - "learning_rate": 1.6128655188656818e-06, - "loss": 1.1178, - "step": 530 - }, - { - "epoch": 0.8829765121596341, - "grad_norm": 2.045692860407394, - "learning_rate": 1.6114800576910789e-06, - "loss": 1.0526, - "step": 531 - }, - { - "epoch": 0.8846393681147371, - "grad_norm": 2.4254673272096094, - "learning_rate": 1.610092719269761e-06, - "loss": 1.1918, - "step": 532 - }, - { - "epoch": 0.8863022240698399, - "grad_norm": 2.2186502378953463, - "learning_rate": 1.6087035078608636e-06, - "loss": 1.0816, - "step": 533 - }, - { - "epoch": 0.8879650800249428, - "grad_norm": 2.1899563735102316, - "learning_rate": 1.6073124277292726e-06, - "loss": 1.077, - "step": 534 - }, - { - "epoch": 0.8896279359800457, - "grad_norm": 2.3420002228619183, - "learning_rate": 1.6059194831456105e-06, - "loss": 1.046, - "step": 535 - }, - { - "epoch": 0.8912907919351486, - "grad_norm": 2.217136239350302, - "learning_rate": 1.6045246783862237e-06, - "loss": 1.0914, - "step": 536 - }, - { - "epoch": 0.8929536478902516, - "grad_norm": 2.6270358774502, - "learning_rate": 1.6031280177331704e-06, - "loss": 1.2156, - "step": 537 - }, - { - "epoch": 0.8946165038453544, - "grad_norm": 2.4382518238543596, - "learning_rate": 1.6017295054742044e-06, - "loss": 1.0995, - "step": 538 - }, - { - "epoch": 0.8962793598004573, - "grad_norm": 2.3688360741829917, - "learning_rate": 1.6003291459027653e-06, - "loss": 1.0843, - "step": 539 - }, - { - "epoch": 0.8979422157555602, - "grad_norm": 2.208126828921951, - "learning_rate": 1.5989269433179644e-06, - "loss": 1.1147, - "step": 540 - }, - { - "epoch": 0.8996050717106631, - "grad_norm": 2.2413007358436188, - "learning_rate": 1.5975229020245697e-06, - "loss": 1.1003, - "step": 541 - }, - { - "epoch": 0.9012679276657659, - "grad_norm": 2.226637939011591, - "learning_rate": 1.596117026332995e-06, - "loss": 0.9256, - "step": 542 - }, - { - "epoch": 0.9029307836208689, - "grad_norm": 2.117810788415696, - "learning_rate": 1.5947093205592851e-06, - "loss": 1.159, - "step": 543 - }, - { - "epoch": 0.9045936395759717, - "grad_norm": 2.0662799008829476, - "learning_rate": 1.593299789025104e-06, - "loss": 1.0951, - "step": 544 - }, - { - "epoch": 0.9062564955310746, - "grad_norm": 2.22383521867496, - "learning_rate": 1.5918884360577201e-06, - "loss": 1.0437, - "step": 545 - }, - { - "epoch": 0.9079193514861775, - "grad_norm": 2.1293539340887735, - "learning_rate": 1.5904752659899933e-06, - "loss": 1.1039, - "step": 546 - }, - { - "epoch": 0.9095822074412804, - "grad_norm": 2.1965800318638724, - "learning_rate": 1.5890602831603632e-06, - "loss": 1.0456, - "step": 547 - }, - { - "epoch": 0.9112450633963833, - "grad_norm": 2.084489657850369, - "learning_rate": 1.5876434919128334e-06, - "loss": 1.063, - "step": 548 - }, - { - "epoch": 0.9129079193514862, - "grad_norm": 2.167327544689684, - "learning_rate": 1.5862248965969603e-06, - "loss": 1.0414, - "step": 549 - }, - { - "epoch": 0.914570775306589, - "grad_norm": 2.0664162881784587, - "learning_rate": 1.584804501567838e-06, - "loss": 1.0985, - "step": 550 - }, - { - "epoch": 0.916233631261692, - "grad_norm": 2.319234065867708, - "learning_rate": 1.583382311186086e-06, - "loss": 0.9869, - "step": 551 - }, - { - "epoch": 0.9178964872167948, - "grad_norm": 2.2524367417375113, - "learning_rate": 1.581958329817836e-06, - "loss": 0.9564, - "step": 552 - }, - { - "epoch": 0.9195593431718977, - "grad_norm": 2.2424034311871184, - "learning_rate": 1.5805325618347169e-06, - "loss": 1.065, - "step": 553 - }, - { - "epoch": 0.9212221991270007, - "grad_norm": 2.337350211180452, - "learning_rate": 1.5791050116138438e-06, - "loss": 1.0277, - "step": 554 - }, - { - "epoch": 0.9228850550821035, - "grad_norm": 2.1419920030575206, - "learning_rate": 1.577675683537803e-06, - "loss": 0.9841, - "step": 555 - }, - { - "epoch": 0.9245479110372063, - "grad_norm": 2.2368638321185865, - "learning_rate": 1.5762445819946383e-06, - "loss": 1.1817, - "step": 556 - }, - { - "epoch": 0.9262107669923093, - "grad_norm": 2.1713305118749955, - "learning_rate": 1.5748117113778379e-06, - "loss": 1.0755, - "step": 557 - }, - { - "epoch": 0.9278736229474122, - "grad_norm": 2.288987960510324, - "learning_rate": 1.5733770760863219e-06, - "loss": 1.1393, - "step": 558 - }, - { - "epoch": 0.9295364789025151, - "grad_norm": 2.2628198071243526, - "learning_rate": 1.5719406805244274e-06, - "loss": 1.0763, - "step": 559 - }, - { - "epoch": 0.931199334857618, - "grad_norm": 2.164816303310072, - "learning_rate": 1.570502529101896e-06, - "loss": 1.1296, - "step": 560 - }, - { - "epoch": 0.9328621908127208, - "grad_norm": 2.198211513279556, - "learning_rate": 1.569062626233859e-06, - "loss": 1.1306, - "step": 561 - }, - { - "epoch": 0.9345250467678238, - "grad_norm": 2.0700768610981766, - "learning_rate": 1.5676209763408252e-06, - "loss": 1.0939, - "step": 562 - }, - { - "epoch": 0.9361879027229266, - "grad_norm": 2.2514400348833927, - "learning_rate": 1.5661775838486673e-06, - "loss": 1.1552, - "step": 563 - }, - { - "epoch": 0.9378507586780295, - "grad_norm": 2.090942303284215, - "learning_rate": 1.5647324531886064e-06, - "loss": 0.9839, - "step": 564 - }, - { - "epoch": 0.9395136146331324, - "grad_norm": 2.140109882341939, - "learning_rate": 1.5632855887972007e-06, - "loss": 1.2005, - "step": 565 - }, - { - "epoch": 0.9411764705882353, - "grad_norm": 1.9572836049893243, - "learning_rate": 1.5618369951163316e-06, - "loss": 1.0442, - "step": 566 - }, - { - "epoch": 0.9428393265433381, - "grad_norm": 2.247679565843514, - "learning_rate": 1.5603866765931874e-06, - "loss": 1.0675, - "step": 567 - }, - { - "epoch": 0.9445021824984411, - "grad_norm": 2.16539672713286, - "learning_rate": 1.558934637680254e-06, - "loss": 1.0969, - "step": 568 - }, - { - "epoch": 0.9461650384535439, - "grad_norm": 2.136321120880899, - "learning_rate": 1.5574808828352977e-06, - "loss": 1.0832, - "step": 569 - }, - { - "epoch": 0.9478278944086469, - "grad_norm": 2.35022238962833, - "learning_rate": 1.556025416521352e-06, - "loss": 1.0707, - "step": 570 - }, - { - "epoch": 0.9494907503637497, - "grad_norm": 2.2709844918364186, - "learning_rate": 1.5545682432067063e-06, - "loss": 1.1081, - "step": 571 - }, - { - "epoch": 0.9511536063188526, - "grad_norm": 2.2603031964486706, - "learning_rate": 1.5531093673648897e-06, - "loss": 1.1301, - "step": 572 - }, - { - "epoch": 0.9528164622739556, - "grad_norm": 2.168121102440009, - "learning_rate": 1.5516487934746575e-06, - "loss": 1.054, - "step": 573 - }, - { - "epoch": 0.9544793182290584, - "grad_norm": 2.2875944656650526, - "learning_rate": 1.5501865260199794e-06, - "loss": 1.0521, - "step": 574 - }, - { - "epoch": 0.9561421741841613, - "grad_norm": 2.034798392016362, - "learning_rate": 1.5487225694900222e-06, - "loss": 0.9311, - "step": 575 - }, - { - "epoch": 0.9578050301392642, - "grad_norm": 2.3324178914152967, - "learning_rate": 1.547256928379141e-06, - "loss": 1.2792, - "step": 576 - }, - { - "epoch": 0.9594678860943671, - "grad_norm": 2.12207756608058, - "learning_rate": 1.54578960718686e-06, - "loss": 1.0741, - "step": 577 - }, - { - "epoch": 0.9611307420494699, - "grad_norm": 2.3421431613806085, - "learning_rate": 1.5443206104178627e-06, - "loss": 1.126, - "step": 578 - }, - { - "epoch": 0.9627935980045729, - "grad_norm": 2.2122877182310714, - "learning_rate": 1.5428499425819764e-06, - "loss": 1.135, - "step": 579 - }, - { - "epoch": 0.9644564539596757, - "grad_norm": 2.2047878214189485, - "learning_rate": 1.5413776081941578e-06, - "loss": 1.1399, - "step": 580 - }, - { - "epoch": 0.9661193099147787, - "grad_norm": 2.236152079611479, - "learning_rate": 1.5399036117744811e-06, - "loss": 1.1442, - "step": 581 - }, - { - "epoch": 0.9677821658698815, - "grad_norm": 2.1601189484933476, - "learning_rate": 1.538427957848122e-06, - "loss": 1.0039, - "step": 582 - }, - { - "epoch": 0.9694450218249844, - "grad_norm": 2.060582773193502, - "learning_rate": 1.5369506509453455e-06, - "loss": 1.0841, - "step": 583 - }, - { - "epoch": 0.9711078777800873, - "grad_norm": 2.456082310651417, - "learning_rate": 1.5354716956014909e-06, - "loss": 1.1203, - "step": 584 - }, - { - "epoch": 0.9727707337351902, - "grad_norm": 2.047282112632501, - "learning_rate": 1.5339910963569583e-06, - "loss": 0.9737, - "step": 585 - }, - { - "epoch": 0.974433589690293, - "grad_norm": 2.0171083884034444, - "learning_rate": 1.5325088577571937e-06, - "loss": 1.0301, - "step": 586 - }, - { - "epoch": 0.976096445645396, - "grad_norm": 2.3643508794229513, - "learning_rate": 1.5310249843526774e-06, - "loss": 1.1398, - "step": 587 - }, - { - "epoch": 0.9777593016004988, - "grad_norm": 2.167267117059674, - "learning_rate": 1.5295394806989076e-06, - "loss": 1.1058, - "step": 588 - }, - { - "epoch": 0.9794221575556017, - "grad_norm": 2.1579109019121674, - "learning_rate": 1.5280523513563884e-06, - "loss": 0.9808, - "step": 589 - }, - { - "epoch": 0.9810850135107047, - "grad_norm": 2.2214933213880776, - "learning_rate": 1.526563600890613e-06, - "loss": 1.2226, - "step": 590 - }, - { - "epoch": 0.9827478694658075, - "grad_norm": 2.2694359795676085, - "learning_rate": 1.525073233872053e-06, - "loss": 1.0298, - "step": 591 - }, - { - "epoch": 0.9844107254209105, - "grad_norm": 2.115482333445717, - "learning_rate": 1.5235812548761424e-06, - "loss": 1.0619, - "step": 592 - }, - { - "epoch": 0.9860735813760133, - "grad_norm": 2.2939214488584883, - "learning_rate": 1.5220876684832638e-06, - "loss": 1.2193, - "step": 593 - }, - { - "epoch": 0.9877364373311162, - "grad_norm": 2.145954515190259, - "learning_rate": 1.5205924792787344e-06, - "loss": 1.1042, - "step": 594 - }, - { - "epoch": 0.9893992932862191, - "grad_norm": 2.1711020269771675, - "learning_rate": 1.5190956918527925e-06, - "loss": 1.0768, - "step": 595 - }, - { - "epoch": 0.991062149241322, - "grad_norm": 2.349827442265106, - "learning_rate": 1.517597310800582e-06, - "loss": 1.1737, - "step": 596 - }, - { - "epoch": 0.9927250051964248, - "grad_norm": 2.1677338271015074, - "learning_rate": 1.516097340722141e-06, - "loss": 1.1894, - "step": 597 - }, - { - "epoch": 0.9943878611515278, - "grad_norm": 2.37315650850326, - "learning_rate": 1.5145957862223843e-06, - "loss": 0.9953, - "step": 598 - }, - { - "epoch": 0.9960507171066306, - "grad_norm": 2.115914446964703, - "learning_rate": 1.5130926519110914e-06, - "loss": 1.1486, - "step": 599 - }, - { - "epoch": 0.9977135730617335, - "grad_norm": 2.2944478230358034, - "learning_rate": 1.5115879424028918e-06, - "loss": 1.01, - "step": 600 - }, - { - "epoch": 0.9993764290168364, - "grad_norm": 2.137749774673581, - "learning_rate": 1.5100816623172512e-06, - "loss": 1.1942, - "step": 601 - }, - { - "epoch": 1.0010392849719394, - "grad_norm": 2.1518759789580555, - "learning_rate": 1.5085738162784565e-06, - "loss": 1.0448, - "step": 602 - }, - { - "epoch": 1.0027021409270422, - "grad_norm": 2.0692500287896634, - "learning_rate": 1.5070644089156024e-06, - "loss": 1.0378, - "step": 603 - }, - { - "epoch": 1.0016614745586707, - "grad_norm": 2.230842725197309, - "learning_rate": 1.5055534448625764e-06, - "loss": 0.9973, - "step": 604 - }, - { - "epoch": 1.0016614745586707, - "eval_loss": 1.389662504196167, - "eval_runtime": 24.637, - "eval_samples_per_second": 0.446, - "eval_steps_per_second": 0.122, - "step": 604 - }, - { - "epoch": 1.0033229491173417, - "grad_norm": 2.009928207068121, - "learning_rate": 1.5040409287580457e-06, - "loss": 1.0293, - "step": 605 - }, - { - "epoch": 1.0049844236760124, - "grad_norm": 2.120439791656414, - "learning_rate": 1.5025268652454421e-06, - "loss": 1.048, - "step": 606 - }, - { - "epoch": 1.0066458982346833, - "grad_norm": 2.04371411144112, - "learning_rate": 1.501011258972948e-06, - "loss": 1.0411, - "step": 607 - }, - { - "epoch": 1.008307372793354, - "grad_norm": 1.9287047292797497, - "learning_rate": 1.4994941145934815e-06, - "loss": 0.8436, - "step": 608 - }, - { - "epoch": 1.009968847352025, - "grad_norm": 2.0850070547651067, - "learning_rate": 1.4979754367646833e-06, - "loss": 0.9103, - "step": 609 - }, - { - "epoch": 1.0116303219106957, - "grad_norm": 2.0786145289365203, - "learning_rate": 1.4964552301489018e-06, - "loss": 0.9403, - "step": 610 - }, - { - "epoch": 1.0132917964693666, - "grad_norm": 2.2212349830956652, - "learning_rate": 1.494933499413178e-06, - "loss": 1.1803, - "step": 611 - }, - { - "epoch": 1.0149532710280373, - "grad_norm": 2.2692136037661843, - "learning_rate": 1.4934102492292336e-06, - "loss": 0.9701, - "step": 612 - }, - { - "epoch": 1.0166147455867083, - "grad_norm": 2.0584997416327253, - "learning_rate": 1.491885484273453e-06, - "loss": 1.0582, - "step": 613 - }, - { - "epoch": 1.018276220145379, - "grad_norm": 2.259916957406716, - "learning_rate": 1.4903592092268726e-06, - "loss": 0.9255, - "step": 614 - }, - { - "epoch": 1.01993769470405, - "grad_norm": 2.848368582871536, - "learning_rate": 1.4888314287751638e-06, - "loss": 0.9602, - "step": 615 - }, - { - "epoch": 1.0215991692627207, - "grad_norm": 2.760714314244588, - "learning_rate": 1.48730214760862e-06, - "loss": 0.9578, - "step": 616 - }, - { - "epoch": 1.0232606438213916, - "grad_norm": 2.23754202122278, - "learning_rate": 1.4857713704221419e-06, - "loss": 1.0313, - "step": 617 - }, - { - "epoch": 1.0249221183800623, - "grad_norm": 2.348557579974889, - "learning_rate": 1.4842391019152225e-06, - "loss": 1.0304, - "step": 618 - }, - { - "epoch": 1.026583592938733, - "grad_norm": 2.3247250951347485, - "learning_rate": 1.482705346791934e-06, - "loss": 0.9455, - "step": 619 - }, - { - "epoch": 1.028245067497404, - "grad_norm": 2.0361498175354997, - "learning_rate": 1.481170109760911e-06, - "loss": 0.9218, - "step": 620 - }, - { - "epoch": 1.0299065420560747, - "grad_norm": 2.214215564027277, - "learning_rate": 1.4796333955353395e-06, - "loss": 1.0163, - "step": 621 - }, - { - "epoch": 1.0315680166147456, - "grad_norm": 2.1719690959578037, - "learning_rate": 1.4780952088329394e-06, - "loss": 1.0134, - "step": 622 - }, - { - "epoch": 1.0332294911734163, - "grad_norm": 2.2204516984962357, - "learning_rate": 1.476555554375951e-06, - "loss": 0.9959, - "step": 623 - }, - { - "epoch": 1.0348909657320873, - "grad_norm": 2.184066671975778, - "learning_rate": 1.4750144368911207e-06, - "loss": 0.9662, - "step": 624 - }, - { - "epoch": 1.036552440290758, - "grad_norm": 2.265473071872864, - "learning_rate": 1.4734718611096874e-06, - "loss": 0.893, - "step": 625 - }, - { - "epoch": 1.038213914849429, - "grad_norm": 2.2673393121172083, - "learning_rate": 1.4719278317673654e-06, - "loss": 1.0177, - "step": 626 - }, - { - "epoch": 1.0398753894080996, - "grad_norm": 2.3668732645254806, - "learning_rate": 1.4703823536043324e-06, - "loss": 0.983, - "step": 627 - }, - { - "epoch": 1.0415368639667706, - "grad_norm": 2.131498764332704, - "learning_rate": 1.468835431365214e-06, - "loss": 1.0071, - "step": 628 - }, - { - "epoch": 1.0431983385254413, - "grad_norm": 2.149621804291588, - "learning_rate": 1.4672870697990686e-06, - "loss": 0.9003, - "step": 629 - }, - { - "epoch": 1.0448598130841122, - "grad_norm": 2.159649238111743, - "learning_rate": 1.4657372736593736e-06, - "loss": 1.0028, - "step": 630 - }, - { - "epoch": 1.046521287642783, - "grad_norm": 2.1986785581325, - "learning_rate": 1.464186047704011e-06, - "loss": 1.1123, - "step": 631 - }, - { - "epoch": 1.0481827622014537, - "grad_norm": 2.2105863636515726, - "learning_rate": 1.4626333966952518e-06, - "loss": 0.9999, - "step": 632 - }, - { - "epoch": 1.0498442367601246, - "grad_norm": 2.233749646735216, - "learning_rate": 1.4610793253997419e-06, - "loss": 0.9993, - "step": 633 - }, - { - "epoch": 1.0515057113187953, - "grad_norm": 2.143359223529294, - "learning_rate": 1.4595238385884878e-06, - "loss": 0.9324, - "step": 634 - }, - { - "epoch": 1.0531671858774663, - "grad_norm": 2.1270783200542325, - "learning_rate": 1.4579669410368412e-06, - "loss": 0.913, - "step": 635 - }, - { - "epoch": 1.054828660436137, - "grad_norm": 2.311976174877509, - "learning_rate": 1.4564086375244854e-06, - "loss": 1.0857, - "step": 636 - }, - { - "epoch": 1.056490134994808, - "grad_norm": 2.1242450037351333, - "learning_rate": 1.4548489328354194e-06, - "loss": 1.0415, - "step": 637 - }, - { - "epoch": 1.0581516095534786, - "grad_norm": 2.1489145088861554, - "learning_rate": 1.4532878317579443e-06, - "loss": 1.0571, - "step": 638 - }, - { - "epoch": 1.0598130841121496, - "grad_norm": 2.1748830811019055, - "learning_rate": 1.451725339084648e-06, - "loss": 1.1093, - "step": 639 - }, - { - "epoch": 1.0614745586708203, - "grad_norm": 2.038403380979606, - "learning_rate": 1.4501614596123897e-06, - "loss": 0.8422, - "step": 640 - }, - { - "epoch": 1.0631360332294912, - "grad_norm": 2.2259593245678806, - "learning_rate": 1.4485961981422882e-06, - "loss": 1.125, - "step": 641 - }, - { - "epoch": 1.064797507788162, - "grad_norm": 2.14202683520297, - "learning_rate": 1.4470295594797028e-06, - "loss": 0.9507, - "step": 642 - }, - { - "epoch": 1.066458982346833, - "grad_norm": 2.304225430064475, - "learning_rate": 1.4454615484342222e-06, - "loss": 0.947, - "step": 643 - }, - { - "epoch": 1.0681204569055036, - "grad_norm": 2.469583985925475, - "learning_rate": 1.4438921698196477e-06, - "loss": 0.9516, - "step": 644 - }, - { - "epoch": 1.0697819314641746, - "grad_norm": 2.079662445475448, - "learning_rate": 1.4423214284539787e-06, - "loss": 0.9201, - "step": 645 - }, - { - "epoch": 1.0714434060228453, - "grad_norm": 2.2706631611703108, - "learning_rate": 1.4407493291593992e-06, - "loss": 0.9743, - "step": 646 - }, - { - "epoch": 1.073104880581516, - "grad_norm": 2.3017848193416053, - "learning_rate": 1.439175876762262e-06, - "loss": 1.0577, - "step": 647 - }, - { - "epoch": 1.074766355140187, - "grad_norm": 2.118285473386722, - "learning_rate": 1.4376010760930727e-06, - "loss": 0.8594, - "step": 648 - }, - { - "epoch": 1.0764278296988576, - "grad_norm": 2.071039252515359, - "learning_rate": 1.4360249319864775e-06, - "loss": 0.8875, - "step": 649 - }, - { - "epoch": 1.0780893042575286, - "grad_norm": 2.0620465316960934, - "learning_rate": 1.434447449281246e-06, - "loss": 1.0298, - "step": 650 - }, - { - "epoch": 1.0797507788161993, - "grad_norm": 2.3185372556623025, - "learning_rate": 1.432868632820258e-06, - "loss": 0.9762, - "step": 651 - }, - { - "epoch": 1.0814122533748702, - "grad_norm": 2.6906645034252317, - "learning_rate": 1.4312884874504873e-06, - "loss": 1.0237, - "step": 652 - }, - { - "epoch": 1.083073727933541, - "grad_norm": 2.1616057770804815, - "learning_rate": 1.4297070180229881e-06, - "loss": 0.9281, - "step": 653 - }, - { - "epoch": 1.084735202492212, - "grad_norm": 2.117360636691568, - "learning_rate": 1.4281242293928787e-06, - "loss": 0.8539, - "step": 654 - }, - { - "epoch": 1.0863966770508826, - "grad_norm": 2.3065809797011667, - "learning_rate": 1.4265401264193284e-06, - "loss": 1.0164, - "step": 655 - }, - { - "epoch": 1.0880581516095535, - "grad_norm": 2.2835922952642407, - "learning_rate": 1.4249547139655408e-06, - "loss": 0.9527, - "step": 656 - }, - { - "epoch": 1.0897196261682243, - "grad_norm": 2.2551385893064353, - "learning_rate": 1.4233679968987392e-06, - "loss": 1.189, - "step": 657 - }, - { - "epoch": 1.0913811007268952, - "grad_norm": 2.087220306176795, - "learning_rate": 1.421779980090153e-06, - "loss": 0.9813, - "step": 658 - }, - { - "epoch": 1.093042575285566, - "grad_norm": 2.291653419238335, - "learning_rate": 1.4201906684150019e-06, - "loss": 1.0983, - "step": 659 - }, - { - "epoch": 1.0947040498442369, - "grad_norm": 2.174357913738222, - "learning_rate": 1.4186000667524794e-06, - "loss": 1.0493, - "step": 660 - }, - { - "epoch": 1.0963655244029076, - "grad_norm": 2.3975146777189793, - "learning_rate": 1.417008179985741e-06, - "loss": 1.0219, - "step": 661 - }, - { - "epoch": 1.0980269989615783, - "grad_norm": 2.0940561543065135, - "learning_rate": 1.4154150130018865e-06, - "loss": 1.09, - "step": 662 - }, - { - "epoch": 1.0996884735202492, - "grad_norm": 2.376622784313192, - "learning_rate": 1.4138205706919459e-06, - "loss": 0.9597, - "step": 663 - }, - { - "epoch": 1.10134994807892, - "grad_norm": 2.504249815738343, - "learning_rate": 1.4122248579508655e-06, - "loss": 1.0957, - "step": 664 - }, - { - "epoch": 1.1030114226375909, - "grad_norm": 2.126205812177635, - "learning_rate": 1.41062787967749e-06, - "loss": 0.9934, - "step": 665 - }, - { - "epoch": 1.1046728971962616, - "grad_norm": 2.169506367867318, - "learning_rate": 1.4090296407745514e-06, - "loss": 0.9573, - "step": 666 - }, - { - "epoch": 1.1063343717549325, - "grad_norm": 2.2665210150054893, - "learning_rate": 1.4074301461486504e-06, - "loss": 1.0109, - "step": 667 - }, - { - "epoch": 1.1079958463136033, - "grad_norm": 2.154696910395812, - "learning_rate": 1.4058294007102431e-06, - "loss": 0.9593, - "step": 668 - }, - { - "epoch": 1.1096573208722742, - "grad_norm": 2.16252295450875, - "learning_rate": 1.4042274093736256e-06, - "loss": 1.0709, - "step": 669 - }, - { - "epoch": 1.111318795430945, - "grad_norm": 2.0289971880838116, - "learning_rate": 1.4026241770569196e-06, - "loss": 0.9739, - "step": 670 - }, - { - "epoch": 1.1129802699896159, - "grad_norm": 2.060249988117431, - "learning_rate": 1.4010197086820552e-06, - "loss": 0.9788, - "step": 671 - }, - { - "epoch": 1.1146417445482866, - "grad_norm": 2.169402054591631, - "learning_rate": 1.3994140091747586e-06, - "loss": 1.0504, - "step": 672 - }, - { - "epoch": 1.1163032191069575, - "grad_norm": 2.5824180613309844, - "learning_rate": 1.3978070834645348e-06, - "loss": 1.0102, - "step": 673 - }, - { - "epoch": 1.1179646936656282, - "grad_norm": 2.2309401801547173, - "learning_rate": 1.3961989364846532e-06, - "loss": 0.8809, - "step": 674 - }, - { - "epoch": 1.1196261682242992, - "grad_norm": 2.1738166442416134, - "learning_rate": 1.3945895731721331e-06, - "loss": 0.9891, - "step": 675 - }, - { - "epoch": 1.1212876427829699, - "grad_norm": 2.264956154758612, - "learning_rate": 1.3929789984677277e-06, - "loss": 1.03, - "step": 676 - }, - { - "epoch": 1.1229491173416406, - "grad_norm": 2.2123647976197796, - "learning_rate": 1.3913672173159085e-06, - "loss": 1.0692, - "step": 677 - }, - { - "epoch": 1.1246105919003115, - "grad_norm": 2.364489969539715, - "learning_rate": 1.3897542346648523e-06, - "loss": 1.0008, - "step": 678 - }, - { - "epoch": 1.1262720664589823, - "grad_norm": 2.1920721564053407, - "learning_rate": 1.3881400554664227e-06, - "loss": 0.9403, - "step": 679 - }, - { - "epoch": 1.1279335410176532, - "grad_norm": 2.0979599935589643, - "learning_rate": 1.386524684676158e-06, - "loss": 1.0091, - "step": 680 - }, - { - "epoch": 1.129595015576324, - "grad_norm": 2.278663731270055, - "learning_rate": 1.3849081272532544e-06, - "loss": 1.0526, - "step": 681 - }, - { - "epoch": 1.1312564901349949, - "grad_norm": 2.1605440095876753, - "learning_rate": 1.3832903881605507e-06, - "loss": 1.0356, - "step": 682 - }, - { - "epoch": 1.1329179646936656, - "grad_norm": 2.0718604939036136, - "learning_rate": 1.381671472364514e-06, - "loss": 0.9792, - "step": 683 - }, - { - "epoch": 1.1345794392523365, - "grad_norm": 2.1463194438640265, - "learning_rate": 1.380051384835223e-06, - "loss": 0.9814, - "step": 684 - }, - { - "epoch": 1.1362409138110072, - "grad_norm": 2.091677003972167, - "learning_rate": 1.3784301305463545e-06, - "loss": 1.0427, - "step": 685 - }, - { - "epoch": 1.1379023883696782, - "grad_norm": 2.2700678556630494, - "learning_rate": 1.3768077144751674e-06, - "loss": 0.9369, - "step": 686 - }, - { - "epoch": 1.1395638629283489, - "grad_norm": 2.0625458347908943, - "learning_rate": 1.3751841416024862e-06, - "loss": 0.8808, - "step": 687 - }, - { - "epoch": 1.1412253374870198, - "grad_norm": 2.3876992754405912, - "learning_rate": 1.3735594169126878e-06, - "loss": 0.9346, - "step": 688 - }, - { - "epoch": 1.1428868120456905, - "grad_norm": 8.47258353126556, - "learning_rate": 1.3719335453936844e-06, - "loss": 0.9763, - "step": 689 - }, - { - "epoch": 1.1445482866043615, - "grad_norm": 2.1087038863817478, - "learning_rate": 1.37030653203691e-06, - "loss": 1.159, - "step": 690 - }, - { - "epoch": 1.1462097611630322, - "grad_norm": 2.127868180262951, - "learning_rate": 1.3686783818373026e-06, - "loss": 1.0057, - "step": 691 - }, - { - "epoch": 1.147871235721703, - "grad_norm": 2.196072659767744, - "learning_rate": 1.367049099793292e-06, - "loss": 0.9821, - "step": 692 - }, - { - "epoch": 1.1495327102803738, - "grad_norm": 2.199287353415817, - "learning_rate": 1.3654186909067815e-06, - "loss": 1.0796, - "step": 693 - }, - { - "epoch": 1.1511941848390446, - "grad_norm": 2.265025353172786, - "learning_rate": 1.3637871601831338e-06, - "loss": 0.8848, - "step": 694 - }, - { - "epoch": 1.1528556593977155, - "grad_norm": 2.199148208887257, - "learning_rate": 1.3621545126311569e-06, - "loss": 0.8418, - "step": 695 - }, - { - "epoch": 1.1545171339563862, - "grad_norm": 2.1180720226692418, - "learning_rate": 1.3605207532630863e-06, - "loss": 0.8909, - "step": 696 - }, - { - "epoch": 1.1561786085150572, - "grad_norm": 2.2219661239549513, - "learning_rate": 1.358885887094571e-06, - "loss": 1.1277, - "step": 697 - }, - { - "epoch": 1.1578400830737279, - "grad_norm": 2.3215590968893474, - "learning_rate": 1.3572499191446578e-06, - "loss": 0.9389, - "step": 698 - }, - { - "epoch": 1.1595015576323988, - "grad_norm": 2.0426465674972407, - "learning_rate": 1.355612854435776e-06, - "loss": 0.9844, - "step": 699 - }, - { - "epoch": 1.1611630321910695, - "grad_norm": 2.3030330331075555, - "learning_rate": 1.3539746979937233e-06, - "loss": 0.997, - "step": 700 - }, - { - "epoch": 1.1628245067497405, - "grad_norm": 2.2497492700696475, - "learning_rate": 1.3523354548476466e-06, - "loss": 0.8244, - "step": 701 - }, - { - "epoch": 1.1644859813084112, - "grad_norm": 2.585474730795241, - "learning_rate": 1.3506951300300309e-06, - "loss": 1.0063, - "step": 702 - }, - { - "epoch": 1.1661474558670821, - "grad_norm": 2.041245335602706, - "learning_rate": 1.3490537285766808e-06, - "loss": 1.0547, - "step": 703 - }, - { - "epoch": 1.1678089304257528, - "grad_norm": 2.477977632976126, - "learning_rate": 1.347411255526707e-06, - "loss": 1.0055, - "step": 704 - }, - { - "epoch": 1.1694704049844238, - "grad_norm": 2.2319941565239425, - "learning_rate": 1.3457677159225095e-06, - "loss": 0.9844, - "step": 705 - }, - { - "epoch": 1.1711318795430945, - "grad_norm": 2.1658358037506784, - "learning_rate": 1.3441231148097628e-06, - "loss": 1.0095, - "step": 706 - }, - { - "epoch": 1.1727933541017652, - "grad_norm": 2.046058505379159, - "learning_rate": 1.3424774572374004e-06, - "loss": 1.0553, - "step": 707 - }, - { - "epoch": 1.1744548286604362, - "grad_norm": 2.1891816920862364, - "learning_rate": 1.340830748257599e-06, - "loss": 0.9626, - "step": 708 - }, - { - "epoch": 1.1761163032191069, - "grad_norm": 2.866582842225965, - "learning_rate": 1.3391829929257623e-06, - "loss": 1.0702, - "step": 709 - }, - { - "epoch": 1.1777777777777778, - "grad_norm": 2.076967565191604, - "learning_rate": 1.337534196300508e-06, - "loss": 0.9365, - "step": 710 - }, - { - "epoch": 1.1794392523364485, - "grad_norm": 2.1835016754122063, - "learning_rate": 1.3358843634436495e-06, - "loss": 1.0592, - "step": 711 - }, - { - "epoch": 1.1811007268951195, - "grad_norm": 2.219299561319535, - "learning_rate": 1.3342334994201814e-06, - "loss": 1.0805, - "step": 712 - }, - { - "epoch": 1.1827622014537902, - "grad_norm": 2.147024039114467, - "learning_rate": 1.332581609298264e-06, - "loss": 0.9914, - "step": 713 - }, - { - "epoch": 1.1844236760124611, - "grad_norm": 2.0325479434123457, - "learning_rate": 1.3309286981492082e-06, - "loss": 0.8889, - "step": 714 - }, - { - "epoch": 1.1860851505711318, - "grad_norm": 2.5059245553533858, - "learning_rate": 1.3292747710474592e-06, - "loss": 1.1349, - "step": 715 - }, - { - "epoch": 1.1877466251298028, - "grad_norm": 2.2414243259263524, - "learning_rate": 1.327619833070581e-06, - "loss": 0.8834, - "step": 716 - }, - { - "epoch": 1.1894080996884735, - "grad_norm": 2.2835729231057336, - "learning_rate": 1.3259638892992411e-06, - "loss": 1.0753, - "step": 717 - }, - { - "epoch": 1.1910695742471444, - "grad_norm": 2.1983108569959553, - "learning_rate": 1.3243069448171951e-06, - "loss": 0.9993, - "step": 718 - }, - { - "epoch": 1.1927310488058152, - "grad_norm": 2.1493768529520776, - "learning_rate": 1.3226490047112702e-06, - "loss": 0.9992, - "step": 719 - }, - { - "epoch": 1.194392523364486, - "grad_norm": 2.1982177635859474, - "learning_rate": 1.3209900740713506e-06, - "loss": 1.0959, - "step": 720 - }, - { - "epoch": 1.1960539979231568, - "grad_norm": 2.8542968450301105, - "learning_rate": 1.3193301579903615e-06, - "loss": 1.0538, - "step": 721 - }, - { - "epoch": 1.1977154724818275, - "grad_norm": 2.1619403628202876, - "learning_rate": 1.317669261564253e-06, - "loss": 1.0251, - "step": 722 - }, - { - "epoch": 1.1993769470404985, - "grad_norm": 2.418169263907194, - "learning_rate": 1.3160073898919852e-06, - "loss": 1.0045, - "step": 723 - }, - { - "epoch": 1.2010384215991692, - "grad_norm": 2.255661060936539, - "learning_rate": 1.3143445480755122e-06, - "loss": 0.8968, - "step": 724 - }, - { - "epoch": 1.2026998961578401, - "grad_norm": 2.272919278531151, - "learning_rate": 1.3126807412197664e-06, - "loss": 0.9584, - "step": 725 - }, - { - "epoch": 1.2043613707165108, - "grad_norm": 2.1840708120633594, - "learning_rate": 1.3110159744326426e-06, - "loss": 1.0289, - "step": 726 - }, - { - "epoch": 1.2060228452751818, - "grad_norm": 2.19563599663801, - "learning_rate": 1.3093502528249828e-06, - "loss": 0.9625, - "step": 727 - }, - { - "epoch": 1.2076843198338525, - "grad_norm": 3.020937698837689, - "learning_rate": 1.307683581510561e-06, - "loss": 0.8759, - "step": 728 - }, - { - "epoch": 1.2093457943925234, - "grad_norm": 2.246654200444507, - "learning_rate": 1.3060159656060653e-06, - "loss": 1.2153, - "step": 729 - }, - { - "epoch": 1.2110072689511941, - "grad_norm": 2.0769588680295774, - "learning_rate": 1.304347410231085e-06, - "loss": 1.0093, - "step": 730 - }, - { - "epoch": 1.212668743509865, - "grad_norm": 2.288118936663974, - "learning_rate": 1.3026779205080931e-06, - "loss": 1.0355, - "step": 731 - }, - { - "epoch": 1.2143302180685358, - "grad_norm": 2.184081985214361, - "learning_rate": 1.3010075015624308e-06, - "loss": 0.9306, - "step": 732 - }, - { - "epoch": 1.2159916926272065, - "grad_norm": 2.1299774896655106, - "learning_rate": 1.2993361585222927e-06, - "loss": 0.9958, - "step": 733 - }, - { - "epoch": 1.2176531671858775, - "grad_norm": 2.175753135707468, - "learning_rate": 1.2976638965187094e-06, - "loss": 0.9293, - "step": 734 - }, - { - "epoch": 1.2193146417445484, - "grad_norm": 2.2923713334477536, - "learning_rate": 1.295990720685534e-06, - "loss": 1.0258, - "step": 735 - }, - { - "epoch": 1.2209761163032191, - "grad_norm": 2.2957849147388525, - "learning_rate": 1.294316636159424e-06, - "loss": 0.9638, - "step": 736 - }, - { - "epoch": 1.2226375908618898, - "grad_norm": 2.250938246343808, - "learning_rate": 1.2926416480798267e-06, - "loss": 0.8653, - "step": 737 - }, - { - "epoch": 1.2242990654205608, - "grad_norm": 2.142325631732433, - "learning_rate": 1.2909657615889638e-06, - "loss": 0.9697, - "step": 738 - }, - { - "epoch": 1.2259605399792315, - "grad_norm": 2.039460360724837, - "learning_rate": 1.289288981831815e-06, - "loss": 0.9789, - "step": 739 - }, - { - "epoch": 1.2276220145379024, - "grad_norm": 2.2252231947797054, - "learning_rate": 1.2876113139561018e-06, - "loss": 1.1306, - "step": 740 - }, - { - "epoch": 1.2292834890965731, - "grad_norm": 2.101017091432986, - "learning_rate": 1.285932763112273e-06, - "loss": 0.9276, - "step": 741 - }, - { - "epoch": 1.230944963655244, - "grad_norm": 2.1947947768274334, - "learning_rate": 1.2842533344534875e-06, - "loss": 0.9702, - "step": 742 - }, - { - "epoch": 1.2326064382139148, - "grad_norm": 2.0326132196001545, - "learning_rate": 1.2825730331355995e-06, - "loss": 0.868, - "step": 743 - }, - { - "epoch": 1.2342679127725857, - "grad_norm": 2.148348415668642, - "learning_rate": 1.2808918643171423e-06, - "loss": 0.9215, - "step": 744 - }, - { - "epoch": 1.2359293873312565, - "grad_norm": 2.2147832136292327, - "learning_rate": 1.279209833159312e-06, - "loss": 0.9085, - "step": 745 - }, - { - "epoch": 1.2375908618899274, - "grad_norm": 2.2716999685750796, - "learning_rate": 1.2775269448259524e-06, - "loss": 0.9814, - "step": 746 - }, - { - "epoch": 1.2392523364485981, - "grad_norm": 2.294321169355599, - "learning_rate": 1.275843204483539e-06, - "loss": 1.0459, - "step": 747 - }, - { - "epoch": 1.2409138110072688, - "grad_norm": 2.0155276917734435, - "learning_rate": 1.2741586173011623e-06, - "loss": 0.9093, - "step": 748 - }, - { - "epoch": 1.2425752855659398, - "grad_norm": 2.176244110291096, - "learning_rate": 1.2724731884505134e-06, - "loss": 1.0437, - "step": 749 - }, - { - "epoch": 1.2442367601246107, - "grad_norm": 2.060993253717314, - "learning_rate": 1.2707869231058665e-06, - "loss": 0.9021, - "step": 750 - }, - { - "epoch": 1.2458982346832814, - "grad_norm": 2.2567394830685688, - "learning_rate": 1.2690998264440651e-06, - "loss": 1.0458, - "step": 751 - }, - { - "epoch": 1.2475597092419521, - "grad_norm": 2.1356542645695917, - "learning_rate": 1.2674119036445034e-06, - "loss": 0.8728, - "step": 752 - }, - { - "epoch": 1.249221183800623, - "grad_norm": 2.146513267279053, - "learning_rate": 1.2657231598891125e-06, - "loss": 0.9412, - "step": 753 - }, - { - "epoch": 1.2508826583592938, - "grad_norm": 2.237050919851434, - "learning_rate": 1.2640336003623442e-06, - "loss": 0.9418, - "step": 754 - }, - { - "epoch": 1.2525441329179647, - "grad_norm": 2.309743390797678, - "learning_rate": 1.2623432302511542e-06, - "loss": 1.1053, - "step": 755 - }, - { - "epoch": 1.2525441329179647, - "eval_loss": 1.3914175033569336, - "eval_runtime": 24.4622, - "eval_samples_per_second": 0.45, - "eval_steps_per_second": 0.123, - "step": 755 - }, - { - "epoch": 1.257528556593977, - "grad_norm": 2.194516385421637, - "learning_rate": 1.260652054744987e-06, - "loss": 1.0752, - "step": 756 - }, - { - "epoch": 1.259190031152648, - "grad_norm": 2.0798826656496514, - "learning_rate": 1.258960079035759e-06, - "loss": 1.0474, - "step": 757 - }, - { - "epoch": 1.2608515057113188, - "grad_norm": 2.4630029006380663, - "learning_rate": 1.2572673083178447e-06, - "loss": 1.0878, - "step": 758 - }, - { - "epoch": 1.2625129802699897, - "grad_norm": 2.059087102554878, - "learning_rate": 1.2555737477880575e-06, - "loss": 0.9406, - "step": 759 - }, - { - "epoch": 1.2641744548286604, - "grad_norm": 2.0994176763702717, - "learning_rate": 1.2538794026456365e-06, - "loss": 0.7893, - "step": 760 - }, - { - "epoch": 1.2658359293873311, - "grad_norm": 2.2656607881570534, - "learning_rate": 1.2521842780922298e-06, - "loss": 0.9353, - "step": 761 - }, - { - "epoch": 1.267497403946002, - "grad_norm": 2.2731202307003735, - "learning_rate": 1.2504883793318777e-06, - "loss": 0.9712, - "step": 762 - }, - { - "epoch": 1.269158878504673, - "grad_norm": 2.199059936980419, - "learning_rate": 1.2487917115709973e-06, - "loss": 0.8983, - "step": 763 - }, - { - "epoch": 1.2708203530633437, - "grad_norm": 2.3268110070781787, - "learning_rate": 1.2470942800183674e-06, - "loss": 0.881, - "step": 764 - }, - { - "epoch": 1.2724818276220144, - "grad_norm": 2.4916547720738245, - "learning_rate": 1.2453960898851105e-06, - "loss": 1.0081, - "step": 765 - }, - { - "epoch": 1.2741433021806854, - "grad_norm": 2.2962920389351407, - "learning_rate": 1.2436971463846788e-06, - "loss": 0.9745, - "step": 766 - }, - { - "epoch": 1.275804776739356, - "grad_norm": 2.2101054126420863, - "learning_rate": 1.2419974547328364e-06, - "loss": 0.9563, - "step": 767 - }, - { - "epoch": 1.277466251298027, - "grad_norm": 2.2891187380085913, - "learning_rate": 1.2402970201476457e-06, - "loss": 0.9303, - "step": 768 - }, - { - "epoch": 1.2791277258566978, - "grad_norm": 2.1360488073197024, - "learning_rate": 1.2385958478494484e-06, - "loss": 0.8754, - "step": 769 - }, - { - "epoch": 1.2807892004153687, - "grad_norm": 2.19618235102571, - "learning_rate": 1.236893943060852e-06, - "loss": 0.9035, - "step": 770 - }, - { - "epoch": 1.2824506749740394, - "grad_norm": 2.134112006128728, - "learning_rate": 1.235191311006712e-06, - "loss": 0.9439, - "step": 771 - }, - { - "epoch": 1.2841121495327104, - "grad_norm": 3.1612461680428816, - "learning_rate": 1.2334879569141172e-06, - "loss": 0.8783, - "step": 772 - }, - { - "epoch": 1.285773624091381, - "grad_norm": 1.9962401932268041, - "learning_rate": 1.231783886012373e-06, - "loss": 0.9263, - "step": 773 - }, - { - "epoch": 1.287435098650052, - "grad_norm": 2.1827109534797695, - "learning_rate": 1.230079103532985e-06, - "loss": 0.9417, - "step": 774 - }, - { - "epoch": 1.2890965732087227, - "grad_norm": 2.7468270854753336, - "learning_rate": 1.228373614709644e-06, - "loss": 1.004, - "step": 775 - }, - { - "epoch": 1.2907580477673934, - "grad_norm": 2.1747453190202646, - "learning_rate": 1.2266674247782086e-06, - "loss": 0.9531, - "step": 776 - }, - { - "epoch": 1.2924195223260644, - "grad_norm": 2.085633930937514, - "learning_rate": 1.2249605389766895e-06, - "loss": 1.0289, - "step": 777 - }, - { - "epoch": 1.2940809968847353, - "grad_norm": 2.2021065356232317, - "learning_rate": 1.223252962545235e-06, - "loss": 0.8765, - "step": 778 - }, - { - "epoch": 1.295742471443406, - "grad_norm": 2.1829312571834896, - "learning_rate": 1.2215447007261133e-06, - "loss": 1.0022, - "step": 779 - }, - { - "epoch": 1.2974039460020768, - "grad_norm": 2.1283332483638286, - "learning_rate": 1.2198357587636956e-06, - "loss": 0.9697, - "step": 780 - }, - { - "epoch": 1.2990654205607477, - "grad_norm": 2.525909866186781, - "learning_rate": 1.2181261419044426e-06, - "loss": 0.9726, - "step": 781 - }, - { - "epoch": 1.3007268951194184, - "grad_norm": 2.1648814032733186, - "learning_rate": 1.2164158553968855e-06, - "loss": 1.0231, - "step": 782 - }, - { - "epoch": 1.3023883696780894, - "grad_norm": 2.2334029411110556, - "learning_rate": 1.2147049044916128e-06, - "loss": 1.1608, - "step": 783 - }, - { - "epoch": 1.30404984423676, - "grad_norm": 2.3547357866951093, - "learning_rate": 1.2129932944412518e-06, - "loss": 0.9692, - "step": 784 - }, - { - "epoch": 1.305711318795431, - "grad_norm": 2.2065719120739726, - "learning_rate": 1.2112810305004535e-06, - "loss": 0.8479, - "step": 785 - }, - { - "epoch": 1.3073727933541017, - "grad_norm": 2.125915368197623, - "learning_rate": 1.2095681179258764e-06, - "loss": 0.9621, - "step": 786 - }, - { - "epoch": 1.3090342679127727, - "grad_norm": 2.645809087395215, - "learning_rate": 1.2078545619761702e-06, - "loss": 1.0085, - "step": 787 - }, - { - "epoch": 1.3106957424714434, - "grad_norm": 2.250819574066223, - "learning_rate": 1.2061403679119601e-06, - "loss": 0.9068, - "step": 788 - }, - { - "epoch": 1.3123572170301143, - "grad_norm": 2.21749937163358, - "learning_rate": 1.2044255409958303e-06, - "loss": 0.9283, - "step": 789 - }, - { - "epoch": 1.314018691588785, - "grad_norm": 2.2743487946390992, - "learning_rate": 1.2027100864923075e-06, - "loss": 0.9947, - "step": 790 - }, - { - "epoch": 1.3156801661474558, - "grad_norm": 2.6923629420708646, - "learning_rate": 1.200994009667845e-06, - "loss": 0.9471, - "step": 791 - }, - { - "epoch": 1.3173416407061267, - "grad_norm": 2.3195724880810102, - "learning_rate": 1.1992773157908072e-06, - "loss": 0.9601, - "step": 792 - }, - { - "epoch": 1.3190031152647976, - "grad_norm": 2.0807605681167405, - "learning_rate": 1.1975600101314525e-06, - "loss": 0.8885, - "step": 793 - }, - { - "epoch": 1.3206645898234683, - "grad_norm": 2.0900202951082147, - "learning_rate": 1.1958420979619175e-06, - "loss": 1.0104, - "step": 794 - }, - { - "epoch": 1.322326064382139, - "grad_norm": 2.183167054296833, - "learning_rate": 1.1941235845562005e-06, - "loss": 0.9351, - "step": 795 - }, - { - "epoch": 1.32398753894081, - "grad_norm": 2.094498837050807, - "learning_rate": 1.1924044751901464e-06, - "loss": 0.995, - "step": 796 - }, - { - "epoch": 1.3256490134994807, - "grad_norm": 2.1534518031095957, - "learning_rate": 1.1906847751414291e-06, - "loss": 1.0239, - "step": 797 - }, - { - "epoch": 1.3273104880581517, - "grad_norm": 2.4401111022025557, - "learning_rate": 1.188964489689536e-06, - "loss": 1.0308, - "step": 798 - }, - { - "epoch": 1.3289719626168224, - "grad_norm": 2.6081491691522873, - "learning_rate": 1.1872436241157518e-06, - "loss": 1.1202, - "step": 799 - }, - { - "epoch": 1.3306334371754933, - "grad_norm": 2.3979641911118383, - "learning_rate": 1.1855221837031418e-06, - "loss": 0.9712, - "step": 800 - }, - { - "epoch": 1.332294911734164, - "grad_norm": 2.3288896242545563, - "learning_rate": 1.1838001737365363e-06, - "loss": 0.9857, - "step": 801 - }, - { - "epoch": 1.333956386292835, - "grad_norm": 2.2743604591828714, - "learning_rate": 1.1820775995025146e-06, - "loss": 1.0457, - "step": 802 - }, - { - "epoch": 1.3356178608515057, - "grad_norm": 2.2419016690025106, - "learning_rate": 1.1803544662893875e-06, - "loss": 1.0037, - "step": 803 - }, - { - "epoch": 1.3372793354101766, - "grad_norm": 2.097257183297578, - "learning_rate": 1.1786307793871823e-06, - "loss": 1.0137, - "step": 804 - }, - { - "epoch": 1.3389408099688473, - "grad_norm": 2.0525680314921115, - "learning_rate": 1.1769065440876263e-06, - "loss": 0.941, - "step": 805 - }, - { - "epoch": 1.340602284527518, - "grad_norm": 2.1484754300130797, - "learning_rate": 1.1751817656841297e-06, - "loss": 0.8931, - "step": 806 - }, - { - "epoch": 1.342263759086189, - "grad_norm": 2.049973872619327, - "learning_rate": 1.1734564494717708e-06, - "loss": 0.9391, - "step": 807 - }, - { - "epoch": 1.34392523364486, - "grad_norm": 2.1411969844449117, - "learning_rate": 1.171730600747279e-06, - "loss": 0.8776, - "step": 808 - }, - { - "epoch": 1.3455867082035307, - "grad_norm": 2.333385888491211, - "learning_rate": 1.1700042248090174e-06, - "loss": 0.8865, - "step": 809 - }, - { - "epoch": 1.3472481827622014, - "grad_norm": 2.172814651117749, - "learning_rate": 1.1682773269569692e-06, - "loss": 1.1343, - "step": 810 - }, - { - "epoch": 1.3489096573208723, - "grad_norm": 2.106719631129538, - "learning_rate": 1.1665499124927182e-06, - "loss": 0.9706, - "step": 811 - }, - { - "epoch": 1.350571131879543, - "grad_norm": 2.2297462633284812, - "learning_rate": 1.164821986719436e-06, - "loss": 0.9913, - "step": 812 - }, - { - "epoch": 1.352232606438214, - "grad_norm": 2.617400755249897, - "learning_rate": 1.1630935549418626e-06, - "loss": 0.9445, - "step": 813 - }, - { - "epoch": 1.3538940809968847, - "grad_norm": 2.1827702013845895, - "learning_rate": 1.161364622466292e-06, - "loss": 0.9375, - "step": 814 - }, - { - "epoch": 1.3555555555555556, - "grad_norm": 2.2016299775994876, - "learning_rate": 1.159635194600555e-06, - "loss": 0.9031, - "step": 815 - }, - { - "epoch": 1.3572170301142263, - "grad_norm": 2.070277398005119, - "learning_rate": 1.157905276654004e-06, - "loss": 0.9009, - "step": 816 - }, - { - "epoch": 1.358878504672897, - "grad_norm": 2.141179928339341, - "learning_rate": 1.1561748739374944e-06, - "loss": 0.9099, - "step": 817 - }, - { - "epoch": 1.360539979231568, - "grad_norm": 2.007914206117487, - "learning_rate": 1.1544439917633716e-06, - "loss": 0.892, - "step": 818 - }, - { - "epoch": 1.362201453790239, - "grad_norm": 2.0648321350070526, - "learning_rate": 1.1527126354454525e-06, - "loss": 0.958, - "step": 819 - }, - { - "epoch": 1.3638629283489097, - "grad_norm": 2.0018244208752565, - "learning_rate": 1.1509808102990085e-06, - "loss": 0.9431, - "step": 820 - }, - { - "epoch": 1.3655244029075804, - "grad_norm": 2.361320004069871, - "learning_rate": 1.1492485216407513e-06, - "loss": 1.0479, - "step": 821 - }, - { - "epoch": 1.3671858774662513, - "grad_norm": 2.6806488863596862, - "learning_rate": 1.1475157747888158e-06, - "loss": 0.9181, - "step": 822 - }, - { - "epoch": 1.3688473520249222, - "grad_norm": 2.1942518382547713, - "learning_rate": 1.145782575062743e-06, - "loss": 0.9427, - "step": 823 - }, - { - "epoch": 1.370508826583593, - "grad_norm": 2.122601010531213, - "learning_rate": 1.1440489277834645e-06, - "loss": 1.0671, - "step": 824 - }, - { - "epoch": 1.3721703011422637, - "grad_norm": 1.9965619962296717, - "learning_rate": 1.1423148382732853e-06, - "loss": 0.9456, - "step": 825 - }, - { - "epoch": 1.3738317757009346, - "grad_norm": 2.318013669509405, - "learning_rate": 1.1405803118558687e-06, - "loss": 0.9946, - "step": 826 - }, - { - "epoch": 1.3754932502596053, - "grad_norm": 2.1436011385350056, - "learning_rate": 1.1388453538562195e-06, - "loss": 0.9548, - "step": 827 - }, - { - "epoch": 1.3771547248182763, - "grad_norm": 2.1480524834330588, - "learning_rate": 1.137109969600667e-06, - "loss": 1.0541, - "step": 828 - }, - { - "epoch": 1.378816199376947, - "grad_norm": 2.417891031584686, - "learning_rate": 1.1353741644168487e-06, - "loss": 0.9857, - "step": 829 - }, - { - "epoch": 1.380477673935618, - "grad_norm": 2.1577692143799516, - "learning_rate": 1.1336379436336953e-06, - "loss": 1.0296, - "step": 830 - }, - { - "epoch": 1.3821391484942886, - "grad_norm": 2.3755322611919807, - "learning_rate": 1.131901312581413e-06, - "loss": 1.0684, - "step": 831 - }, - { - "epoch": 1.3838006230529594, - "grad_norm": 2.193724004809641, - "learning_rate": 1.1301642765914672e-06, - "loss": 0.995, - "step": 832 - }, - { - "epoch": 1.3854620976116303, - "grad_norm": 2.084892517254851, - "learning_rate": 1.1284268409965671e-06, - "loss": 1.0584, - "step": 833 - }, - { - "epoch": 1.3871235721703012, - "grad_norm": 2.107802237296406, - "learning_rate": 1.1266890111306483e-06, - "loss": 0.9933, - "step": 834 - }, - { - "epoch": 1.388785046728972, - "grad_norm": 2.2944459376176116, - "learning_rate": 1.1249507923288561e-06, - "loss": 0.8813, - "step": 835 - }, - { - "epoch": 1.3904465212876427, - "grad_norm": 2.4275620341058697, - "learning_rate": 1.1232121899275313e-06, - "loss": 1.0109, - "step": 836 - }, - { - "epoch": 1.3921079958463136, - "grad_norm": 2.213400646410023, - "learning_rate": 1.1214732092641914e-06, - "loss": 1.0679, - "step": 837 - }, - { - "epoch": 1.3937694704049846, - "grad_norm": 2.1158008518205724, - "learning_rate": 1.1197338556775155e-06, - "loss": 0.945, - "step": 838 - }, - { - "epoch": 1.3954309449636553, - "grad_norm": 2.2576033887254674, - "learning_rate": 1.1179941345073277e-06, - "loss": 1.1281, - "step": 839 - }, - { - "epoch": 1.397092419522326, - "grad_norm": 2.174956086888558, - "learning_rate": 1.1162540510945798e-06, - "loss": 0.9392, - "step": 840 - }, - { - "epoch": 1.398753894080997, - "grad_norm": 2.2102299571149473, - "learning_rate": 1.1145136107813361e-06, - "loss": 0.8779, - "step": 841 - }, - { - "epoch": 1.4004153686396676, - "grad_norm": 2.293662364545717, - "learning_rate": 1.1127728189107574e-06, - "loss": 0.8608, - "step": 842 - }, - { - "epoch": 1.4020768431983386, - "grad_norm": 2.2613684494376756, - "learning_rate": 1.111031680827083e-06, - "loss": 0.8828, - "step": 843 - }, - { - "epoch": 1.4037383177570093, - "grad_norm": 2.1319924720800696, - "learning_rate": 1.1092902018756148e-06, - "loss": 1.0049, - "step": 844 - }, - { - "epoch": 1.4053997923156802, - "grad_norm": 2.28021887844631, - "learning_rate": 1.1075483874027018e-06, - "loss": 0.9074, - "step": 845 - }, - { - "epoch": 1.407061266874351, - "grad_norm": 2.1484668932462445, - "learning_rate": 1.1058062427557228e-06, - "loss": 1.0847, - "step": 846 - }, - { - "epoch": 1.4087227414330217, - "grad_norm": 2.2205863989831416, - "learning_rate": 1.10406377328307e-06, - "loss": 0.9719, - "step": 847 - }, - { - "epoch": 1.4103842159916926, - "grad_norm": 2.1508101547003564, - "learning_rate": 1.1023209843341332e-06, - "loss": 1.0049, - "step": 848 - }, - { - "epoch": 1.4120456905503636, - "grad_norm": 2.3302734390720397, - "learning_rate": 1.1005778812592832e-06, - "loss": 1.1868, - "step": 849 - }, - { - "epoch": 1.4137071651090343, - "grad_norm": 2.1781701265290327, - "learning_rate": 1.0988344694098544e-06, - "loss": 0.8906, - "step": 850 - }, - { - "epoch": 1.415368639667705, - "grad_norm": 2.226671789203206, - "learning_rate": 1.0970907541381294e-06, - "loss": 0.929, - "step": 851 - }, - { - "epoch": 1.417030114226376, - "grad_norm": 2.5515388989279866, - "learning_rate": 1.095346740797323e-06, - "loss": 0.8837, - "step": 852 - }, - { - "epoch": 1.4186915887850469, - "grad_norm": 2.236679949130665, - "learning_rate": 1.0936024347415642e-06, - "loss": 0.9804, - "step": 853 - }, - { - "epoch": 1.4203530633437176, - "grad_norm": 2.9612492685412852, - "learning_rate": 1.091857841325881e-06, - "loss": 0.9525, - "step": 854 - }, - { - "epoch": 1.4220145379023883, - "grad_norm": 2.5666504315216137, - "learning_rate": 1.0901129659061837e-06, - "loss": 0.8627, - "step": 855 - }, - { - "epoch": 1.4236760124610592, - "grad_norm": 2.4015395133256106, - "learning_rate": 1.0883678138392475e-06, - "loss": 0.9089, - "step": 856 - }, - { - "epoch": 1.42533748701973, - "grad_norm": 2.3067956889994097, - "learning_rate": 1.0866223904826989e-06, - "loss": 0.9487, - "step": 857 - }, - { - "epoch": 1.426998961578401, - "grad_norm": 2.0391869701955647, - "learning_rate": 1.084876701194995e-06, - "loss": 1.0349, - "step": 858 - }, - { - "epoch": 1.4286604361370716, - "grad_norm": 2.1857273166825095, - "learning_rate": 1.0831307513354112e-06, - "loss": 0.9983, - "step": 859 - }, - { - "epoch": 1.4303219106957425, - "grad_norm": 2.2108513988117107, - "learning_rate": 1.0813845462640206e-06, - "loss": 0.9867, - "step": 860 - }, - { - "epoch": 1.4319833852544133, - "grad_norm": 2.4469400036513838, - "learning_rate": 1.0796380913416823e-06, - "loss": 0.9024, - "step": 861 - }, - { - "epoch": 1.433644859813084, - "grad_norm": 2.1429468615368905, - "learning_rate": 1.0778913919300209e-06, - "loss": 0.8963, - "step": 862 - }, - { - "epoch": 1.435306334371755, - "grad_norm": 2.176033522508349, - "learning_rate": 1.0761444533914124e-06, - "loss": 0.9561, - "step": 863 - }, - { - "epoch": 1.4369678089304259, - "grad_norm": 2.0784290252253137, - "learning_rate": 1.0743972810889654e-06, - "loss": 0.906, - "step": 864 - }, - { - "epoch": 1.4386292834890966, - "grad_norm": 2.2109329665454087, - "learning_rate": 1.0726498803865088e-06, - "loss": 0.9316, - "step": 865 - }, - { - "epoch": 1.4402907580477673, - "grad_norm": 2.160142006014489, - "learning_rate": 1.0709022566485697e-06, - "loss": 1.0291, - "step": 866 - }, - { - "epoch": 1.4419522326064382, - "grad_norm": 2.236888922467273, - "learning_rate": 1.069154415240362e-06, - "loss": 0.9812, - "step": 867 - }, - { - "epoch": 1.4436137071651092, - "grad_norm": 2.4886237263035276, - "learning_rate": 1.067406361527768e-06, - "loss": 1.0427, - "step": 868 - }, - { - "epoch": 1.44527518172378, - "grad_norm": 2.2522311274987823, - "learning_rate": 1.0656581008773197e-06, - "loss": 0.8864, - "step": 869 - }, - { - "epoch": 1.4469366562824506, - "grad_norm": 2.126057360159879, - "learning_rate": 1.0639096386561864e-06, - "loss": 1.0294, - "step": 870 - }, - { - "epoch": 1.4485981308411215, - "grad_norm": 2.257463390997248, - "learning_rate": 1.0621609802321553e-06, - "loss": 0.9633, - "step": 871 - }, - { - "epoch": 1.4502596053997923, - "grad_norm": 2.26321915407214, - "learning_rate": 1.0604121309736163e-06, - "loss": 0.8698, - "step": 872 - }, - { - "epoch": 1.4519210799584632, - "grad_norm": 2.061806930610862, - "learning_rate": 1.058663096249545e-06, - "loss": 1.0377, - "step": 873 - }, - { - "epoch": 1.453582554517134, - "grad_norm": 2.1295799083708147, - "learning_rate": 1.0569138814294862e-06, - "loss": 0.9534, - "step": 874 - }, - { - "epoch": 1.4552440290758049, - "grad_norm": 2.3106775542374294, - "learning_rate": 1.055164491883538e-06, - "loss": 0.9403, - "step": 875 - }, - { - "epoch": 1.4569055036344756, - "grad_norm": 2.2302178209419132, - "learning_rate": 1.0534149329823347e-06, - "loss": 0.9301, - "step": 876 - }, - { - "epoch": 1.4585669781931463, - "grad_norm": 2.2949573702055117, - "learning_rate": 1.0516652100970306e-06, - "loss": 0.9528, - "step": 877 - }, - { - "epoch": 1.4602284527518172, - "grad_norm": 2.10604631236836, - "learning_rate": 1.0499153285992832e-06, - "loss": 0.9069, - "step": 878 - }, - { - "epoch": 1.4618899273104882, - "grad_norm": 2.1660045153149934, - "learning_rate": 1.0481652938612372e-06, - "loss": 0.9408, - "step": 879 - }, - { - "epoch": 1.4635514018691589, - "grad_norm": 2.2748273606386995, - "learning_rate": 1.0464151112555076e-06, - "loss": 1.0554, - "step": 880 - }, - { - "epoch": 1.4652128764278296, - "grad_norm": 2.2711524480515406, - "learning_rate": 1.0446647861551632e-06, - "loss": 1.0468, - "step": 881 - }, - { - "epoch": 1.4668743509865005, - "grad_norm": 2.2946241383411987, - "learning_rate": 1.042914323933711e-06, - "loss": 0.8936, - "step": 882 - }, - { - "epoch": 1.4685358255451713, - "grad_norm": 2.390089218019007, - "learning_rate": 1.0411637299650781e-06, - "loss": 1.0086, - "step": 883 - }, - { - "epoch": 1.4701973001038422, - "grad_norm": 2.340221690538989, - "learning_rate": 1.0394130096235965e-06, - "loss": 1.0351, - "step": 884 - }, - { - "epoch": 1.471858774662513, - "grad_norm": 2.297583872743856, - "learning_rate": 1.0376621682839856e-06, - "loss": 0.9497, - "step": 885 - }, - { - "epoch": 1.4735202492211839, - "grad_norm": 2.305498668474036, - "learning_rate": 1.0359112113213374e-06, - "loss": 0.9291, - "step": 886 - }, - { - "epoch": 1.4751817237798546, - "grad_norm": 2.182291435160706, - "learning_rate": 1.0341601441110981e-06, - "loss": 0.9932, - "step": 887 - }, - { - "epoch": 1.4768431983385255, - "grad_norm": 2.0985924867543924, - "learning_rate": 1.0324089720290522e-06, - "loss": 0.9702, - "step": 888 - }, - { - "epoch": 1.4785046728971962, - "grad_norm": 2.2916738661032863, - "learning_rate": 1.0306577004513064e-06, - "loss": 1.0412, - "step": 889 - }, - { - "epoch": 1.4801661474558672, - "grad_norm": 2.3135550693056115, - "learning_rate": 1.0289063347542726e-06, - "loss": 0.8467, - "step": 890 - }, - { - "epoch": 1.4818276220145379, - "grad_norm": 2.3188283888371393, - "learning_rate": 1.0271548803146525e-06, - "loss": 0.9271, - "step": 891 - }, - { - "epoch": 1.4834890965732086, - "grad_norm": 2.2139777839358845, - "learning_rate": 1.0254033425094196e-06, - "loss": 0.9119, - "step": 892 - }, - { - "epoch": 1.4851505711318795, - "grad_norm": 2.3212272419519713, - "learning_rate": 1.0236517267158026e-06, - "loss": 1.0185, - "step": 893 - }, - { - "epoch": 1.4868120456905505, - "grad_norm": 2.0589993974559757, - "learning_rate": 1.0219000383112713e-06, - "loss": 1.022, - "step": 894 - }, - { - "epoch": 1.4884735202492212, - "grad_norm": 2.5364209766355015, - "learning_rate": 1.020148282673517e-06, - "loss": 0.9629, - "step": 895 - }, - { - "epoch": 1.490134994807892, - "grad_norm": 2.1254602155332982, - "learning_rate": 1.0183964651804382e-06, - "loss": 0.9765, - "step": 896 - }, - { - "epoch": 1.4917964693665628, - "grad_norm": 2.2144306649179, - "learning_rate": 1.0166445912101228e-06, - "loss": 0.9865, - "step": 897 - }, - { - "epoch": 1.4934579439252336, - "grad_norm": 2.6976218997101173, - "learning_rate": 1.0148926661408327e-06, - "loss": 1.1091, - "step": 898 - }, - { - "epoch": 1.4951194184839045, - "grad_norm": 2.1447116841130347, - "learning_rate": 1.0131406953509855e-06, - "loss": 0.9854, - "step": 899 - }, - { - "epoch": 1.4967808930425752, - "grad_norm": 2.313826573818675, - "learning_rate": 1.0113886842191408e-06, - "loss": 1.0696, - "step": 900 - }, - { - "epoch": 1.4984423676012462, - "grad_norm": 2.398689146513669, - "learning_rate": 1.0096366381239806e-06, - "loss": 0.9913, - "step": 901 - }, - { - "epoch": 1.5001038421599169, - "grad_norm": 2.01368084597207, - "learning_rate": 1.0078845624442953e-06, - "loss": 0.8734, - "step": 902 - }, - { - "epoch": 1.5017653167185876, - "grad_norm": 2.162541083177183, - "learning_rate": 1.0061324625589655e-06, - "loss": 0.8432, - "step": 903 - }, - { - "epoch": 1.5034267912772585, - "grad_norm": 2.066277760482693, - "learning_rate": 1.004380343846946e-06, - "loss": 0.9469, - "step": 904 - }, - { - "epoch": 1.5050882658359295, - "grad_norm": 2.291540588940334, - "learning_rate": 1.0026282116872498e-06, - "loss": 0.8913, - "step": 905 - }, - { - "epoch": 1.5067497403946002, - "grad_norm": 2.3475144052615833, - "learning_rate": 1.000876071458931e-06, - "loss": 0.8889, - "step": 906 - }, - { - "epoch": 1.5067497403946002, - "eval_loss": 1.3885624408721924, - "eval_runtime": 24.6265, - "eval_samples_per_second": 0.447, - "eval_steps_per_second": 0.122, - "step": 906 - }, - { - "epoch": 1.508411214953271, - "grad_norm": 3.065303881065278, - "learning_rate": 9.99123928541069e-07, - "loss": 0.9061, - "step": 907 - }, - { - "epoch": 1.5100726895119418, - "grad_norm": 2.191485923954429, - "learning_rate": 9.973717883127503e-07, - "loss": 0.9699, - "step": 908 - }, - { - "epoch": 1.5117341640706128, - "grad_norm": 2.3278723421202505, - "learning_rate": 9.95619656153054e-07, - "loss": 1.0169, - "step": 909 - }, - { - "epoch": 1.5133956386292835, - "grad_norm": 2.3678043287491586, - "learning_rate": 9.938675374410346e-07, - "loss": 0.9757, - "step": 910 - }, - { - "epoch": 1.5150571131879542, - "grad_norm": 2.1901568875128836, - "learning_rate": 9.921154375557046e-07, - "loss": 1.0352, - "step": 911 - }, - { - "epoch": 1.5167185877466252, - "grad_norm": 2.30893338243549, - "learning_rate": 9.903633618760193e-07, - "loss": 0.8631, - "step": 912 - }, - { - "epoch": 1.518380062305296, - "grad_norm": 2.2873579587086366, - "learning_rate": 9.886113157808594e-07, - "loss": 0.8675, - "step": 913 - }, - { - "epoch": 1.5200415368639668, - "grad_norm": 2.085982086942701, - "learning_rate": 9.868593046490144e-07, - "loss": 0.947, - "step": 914 - }, - { - "epoch": 1.5217030114226375, - "grad_norm": 2.060117879856327, - "learning_rate": 9.851073338591675e-07, - "loss": 1.0063, - "step": 915 - }, - { - "epoch": 1.5233644859813085, - "grad_norm": 2.43994470111981, - "learning_rate": 9.833554087898773e-07, - "loss": 0.9692, - "step": 916 - }, - { - "epoch": 1.5250259605399792, - "grad_norm": 2.2294605522286948, - "learning_rate": 9.81603534819562e-07, - "loss": 0.9935, - "step": 917 - }, - { - "epoch": 1.52668743509865, - "grad_norm": 2.1907032716227883, - "learning_rate": 9.798517173264831e-07, - "loss": 0.9507, - "step": 918 - }, - { - "epoch": 1.5283489096573208, - "grad_norm": 2.1045414815637504, - "learning_rate": 9.780999616887288e-07, - "loss": 0.9059, - "step": 919 - }, - { - "epoch": 1.5300103842159918, - "grad_norm": 2.3012223647608323, - "learning_rate": 9.763482732841975e-07, - "loss": 1.0022, - "step": 920 - }, - { - "epoch": 1.5316718587746625, - "grad_norm": 2.272854650284857, - "learning_rate": 9.74596657490581e-07, - "loss": 1.1475, - "step": 921 - }, - { - "epoch": 1.5333333333333332, - "grad_norm": 2.5844520190272853, - "learning_rate": 9.728451196853476e-07, - "loss": 0.9708, - "step": 922 - }, - { - "epoch": 1.5349948078920042, - "grad_norm": 2.1870847904085626, - "learning_rate": 9.710936652457275e-07, - "loss": 1.0149, - "step": 923 - }, - { - "epoch": 1.536656282450675, - "grad_norm": 2.167131744822391, - "learning_rate": 9.693422995486938e-07, - "loss": 0.9241, - "step": 924 - }, - { - "epoch": 1.5383177570093458, - "grad_norm": 2.103729458011538, - "learning_rate": 9.675910279709475e-07, - "loss": 0.8776, - "step": 925 - }, - { - "epoch": 1.5399792315680165, - "grad_norm": 2.2605948836777827, - "learning_rate": 9.658398558889018e-07, - "loss": 0.9223, - "step": 926 - }, - { - "epoch": 1.5416407061266875, - "grad_norm": 2.1973564828092282, - "learning_rate": 9.640887886786623e-07, - "loss": 0.9611, - "step": 927 - }, - { - "epoch": 1.5433021806853584, - "grad_norm": 2.1500165200756998, - "learning_rate": 9.62337831716014e-07, - "loss": 0.9845, - "step": 928 - }, - { - "epoch": 1.5449636552440291, - "grad_norm": 2.1276385579139716, - "learning_rate": 9.605869903764036e-07, - "loss": 0.8422, - "step": 929 - }, - { - "epoch": 1.5466251298026998, - "grad_norm": 2.16875527803873, - "learning_rate": 9.588362700349218e-07, - "loss": 0.8773, - "step": 930 - }, - { - "epoch": 1.5482866043613708, - "grad_norm": 2.175442832799032, - "learning_rate": 9.570856760662888e-07, - "loss": 0.9159, - "step": 931 - }, - { - "epoch": 1.5499480789200415, - "grad_norm": 2.0911533410140652, - "learning_rate": 9.553352138448365e-07, - "loss": 0.9862, - "step": 932 - }, - { - "epoch": 1.5516095534787122, - "grad_norm": 2.4217133148552907, - "learning_rate": 9.535848887444925e-07, - "loss": 1.0448, - "step": 933 - }, - { - "epoch": 1.5532710280373832, - "grad_norm": 2.156898906325239, - "learning_rate": 9.518347061387627e-07, - "loss": 1.0456, - "step": 934 - }, - { - "epoch": 1.554932502596054, - "grad_norm": 2.1174530253263852, - "learning_rate": 9.500846714007168e-07, - "loss": 0.9817, - "step": 935 - }, - { - "epoch": 1.5565939771547248, - "grad_norm": 2.2295767742183337, - "learning_rate": 9.483347899029695e-07, - "loss": 1.1012, - "step": 936 - }, - { - "epoch": 1.5582554517133955, - "grad_norm": 2.045078620592537, - "learning_rate": 9.465850670176653e-07, - "loss": 0.8389, - "step": 937 - }, - { - "epoch": 1.5599169262720665, - "grad_norm": 2.2882936479206313, - "learning_rate": 9.44835508116462e-07, - "loss": 0.9324, - "step": 938 - }, - { - "epoch": 1.5615784008307374, - "grad_norm": 2.274529967263343, - "learning_rate": 9.430861185705137e-07, - "loss": 1.0327, - "step": 939 - }, - { - "epoch": 1.5632398753894081, - "grad_norm": 2.1598121426869565, - "learning_rate": 9.41336903750455e-07, - "loss": 0.9787, - "step": 940 - }, - { - "epoch": 1.5649013499480788, - "grad_norm": 2.2018912955453733, - "learning_rate": 9.395878690263836e-07, - "loss": 0.9555, - "step": 941 - }, - { - "epoch": 1.5665628245067498, - "grad_norm": 2.089375883352287, - "learning_rate": 9.378390197678447e-07, - "loss": 0.9679, - "step": 942 - }, - { - "epoch": 1.5682242990654207, - "grad_norm": 2.203509678408647, - "learning_rate": 9.360903613438137e-07, - "loss": 0.9715, - "step": 943 - }, - { - "epoch": 1.5698857736240914, - "grad_norm": 2.1347102317454367, - "learning_rate": 9.343418991226803e-07, - "loss": 0.7945, - "step": 944 - }, - { - "epoch": 1.5715472481827621, - "grad_norm": 2.378865551913575, - "learning_rate": 9.325936384722321e-07, - "loss": 0.9477, - "step": 945 - }, - { - "epoch": 1.573208722741433, - "grad_norm": 2.4467581458666428, - "learning_rate": 9.308455847596377e-07, - "loss": 1.0578, - "step": 946 - }, - { - "epoch": 1.5748701973001038, - "grad_norm": 2.203361322184244, - "learning_rate": 9.290977433514305e-07, - "loss": 0.884, - "step": 947 - }, - { - "epoch": 1.5765316718587745, - "grad_norm": 2.1727119561544597, - "learning_rate": 9.273501196134914e-07, - "loss": 0.9208, - "step": 948 - }, - { - "epoch": 1.5781931464174455, - "grad_norm": 2.3340178077448837, - "learning_rate": 9.256027189110344e-07, - "loss": 0.9412, - "step": 949 - }, - { - "epoch": 1.5798546209761164, - "grad_norm": 2.2245206192765807, - "learning_rate": 9.23855546608588e-07, - "loss": 0.9677, - "step": 950 - }, - { - "epoch": 1.5815160955347871, - "grad_norm": 2.3713993791607786, - "learning_rate": 9.221086080699792e-07, - "loss": 0.8594, - "step": 951 - }, - { - "epoch": 1.5831775700934578, - "grad_norm": 2.30874801141752, - "learning_rate": 9.203619086583178e-07, - "loss": 1.1003, - "step": 952 - }, - { - "epoch": 1.5848390446521288, - "grad_norm": 2.3037415512544492, - "learning_rate": 9.186154537359794e-07, - "loss": 0.991, - "step": 953 - }, - { - "epoch": 1.5865005192107997, - "grad_norm": 2.360666046738533, - "learning_rate": 9.168692486645893e-07, - "loss": 0.9452, - "step": 954 - }, - { - "epoch": 1.5881619937694704, - "grad_norm": 2.1674212494443736, - "learning_rate": 9.15123298805005e-07, - "loss": 1.0289, - "step": 955 - }, - { - "epoch": 1.5898234683281411, - "grad_norm": 2.2039402749504693, - "learning_rate": 9.133776095173013e-07, - "loss": 0.8989, - "step": 956 - }, - { - "epoch": 1.591484942886812, - "grad_norm": 2.3319779317051994, - "learning_rate": 9.116321861607523e-07, - "loss": 1.0259, - "step": 957 - }, - { - "epoch": 1.593146417445483, - "grad_norm": 2.397971695824514, - "learning_rate": 9.098870340938168e-07, - "loss": 0.9254, - "step": 958 - }, - { - "epoch": 1.5948078920041537, - "grad_norm": 2.1189888494560956, - "learning_rate": 9.081421586741188e-07, - "loss": 0.9747, - "step": 959 - }, - { - "epoch": 1.5964693665628245, - "grad_norm": 2.4628203972860994, - "learning_rate": 9.063975652584357e-07, - "loss": 1.1281, - "step": 960 - }, - { - "epoch": 1.5981308411214954, - "grad_norm": 2.2290027206829452, - "learning_rate": 9.046532592026768e-07, - "loss": 1.0188, - "step": 961 - }, - { - "epoch": 1.599792315680166, - "grad_norm": 2.101837850760124, - "learning_rate": 9.029092458618705e-07, - "loss": 1.0343, - "step": 962 - }, - { - "epoch": 1.6014537902388368, - "grad_norm": 2.1955065161912586, - "learning_rate": 9.011655305901457e-07, - "loss": 0.9229, - "step": 963 - }, - { - "epoch": 1.6031152647975078, - "grad_norm": 2.208465293699187, - "learning_rate": 8.994221187407167e-07, - "loss": 0.9811, - "step": 964 - }, - { - "epoch": 1.6047767393561787, - "grad_norm": 2.189756306702255, - "learning_rate": 8.976790156658665e-07, - "loss": 0.9948, - "step": 965 - }, - { - "epoch": 1.6064382139148494, - "grad_norm": 2.21101125223709, - "learning_rate": 8.959362267169299e-07, - "loss": 0.9206, - "step": 966 - }, - { - "epoch": 1.6080996884735201, - "grad_norm": 2.1552977090716476, - "learning_rate": 8.941937572442773e-07, - "loss": 0.9684, - "step": 967 - }, - { - "epoch": 1.609761163032191, - "grad_norm": 2.0424533004826593, - "learning_rate": 8.924516125972983e-07, - "loss": 0.9515, - "step": 968 - }, - { - "epoch": 1.611422637590862, - "grad_norm": 2.2773490170655437, - "learning_rate": 8.907097981243851e-07, - "loss": 1.0617, - "step": 969 - }, - { - "epoch": 1.6130841121495327, - "grad_norm": 2.219673265679085, - "learning_rate": 8.88968319172917e-07, - "loss": 1.0573, - "step": 970 - }, - { - "epoch": 1.6147455867082035, - "grad_norm": 2.23001657969037, - "learning_rate": 8.872271810892424e-07, - "loss": 1.0252, - "step": 971 - }, - { - "epoch": 1.6164070612668744, - "grad_norm": 2.2328021310208968, - "learning_rate": 8.854863892186639e-07, - "loss": 0.9826, - "step": 972 - }, - { - "epoch": 1.6180685358255453, - "grad_norm": 2.5771699055938475, - "learning_rate": 8.837459489054203e-07, - "loss": 0.9947, - "step": 973 - }, - { - "epoch": 1.619730010384216, - "grad_norm": 2.798440468259025, - "learning_rate": 8.820058654926725e-07, - "loss": 0.9452, - "step": 974 - }, - { - "epoch": 1.6213914849428868, - "grad_norm": 2.1504820505748325, - "learning_rate": 8.802661443224844e-07, - "loss": 1.0738, - "step": 975 - }, - { - "epoch": 1.6230529595015577, - "grad_norm": 2.133788375605966, - "learning_rate": 8.785267907358084e-07, - "loss": 1.0597, - "step": 976 - }, - { - "epoch": 1.6247144340602284, - "grad_norm": 2.1928865638126434, - "learning_rate": 8.767878100724688e-07, - "loss": 0.9914, - "step": 977 - }, - { - "epoch": 1.6263759086188991, - "grad_norm": 2.67304969048529, - "learning_rate": 8.750492076711439e-07, - "loss": 0.9834, - "step": 978 - }, - { - "epoch": 1.62803738317757, - "grad_norm": 2.242441368839289, - "learning_rate": 8.73310988869352e-07, - "loss": 0.9394, - "step": 979 - }, - { - "epoch": 1.629698857736241, - "grad_norm": 2.155656747647904, - "learning_rate": 8.715731590034329e-07, - "loss": 1.083, - "step": 980 - }, - { - "epoch": 1.6313603322949117, - "grad_norm": 2.1738442975670798, - "learning_rate": 8.698357234085327e-07, - "loss": 0.937, - "step": 981 - }, - { - "epoch": 1.6330218068535824, - "grad_norm": 2.1949307066634187, - "learning_rate": 8.680986874185872e-07, - "loss": 0.9225, - "step": 982 - }, - { - "epoch": 1.6346832814122534, - "grad_norm": 2.209794807763499, - "learning_rate": 8.663620563663046e-07, - "loss": 0.9265, - "step": 983 - }, - { - "epoch": 1.6363447559709243, - "grad_norm": 2.27946429282308, - "learning_rate": 8.646258355831513e-07, - "loss": 0.9482, - "step": 984 - }, - { - "epoch": 1.638006230529595, - "grad_norm": 2.4532997949631086, - "learning_rate": 8.628900303993334e-07, - "loss": 1.0124, - "step": 985 - }, - { - "epoch": 1.6396677050882658, - "grad_norm": 2.0841134326683264, - "learning_rate": 8.611546461437808e-07, - "loss": 1.0067, - "step": 986 - }, - { - "epoch": 1.6413291796469367, - "grad_norm": 2.0640910431827395, - "learning_rate": 8.594196881441314e-07, - "loss": 0.7401, - "step": 987 - }, - { - "epoch": 1.6429906542056076, - "grad_norm": 2.442342265002308, - "learning_rate": 8.576851617267149e-07, - "loss": 0.9464, - "step": 988 - }, - { - "epoch": 1.6446521287642781, - "grad_norm": 2.2202450152134725, - "learning_rate": 8.559510722165359e-07, - "loss": 0.9864, - "step": 989 - }, - { - "epoch": 1.646313603322949, - "grad_norm": 2.270663471909469, - "learning_rate": 8.542174249372572e-07, - "loss": 1.1029, - "step": 990 - }, - { - "epoch": 1.64797507788162, - "grad_norm": 2.2174023151217566, - "learning_rate": 8.524842252111843e-07, - "loss": 0.9365, - "step": 991 - }, - { - "epoch": 1.6496365524402907, - "grad_norm": 2.344777176470885, - "learning_rate": 8.507514783592486e-07, - "loss": 1.014, - "step": 992 - }, - { - "epoch": 1.6512980269989614, - "grad_norm": 2.422227525332542, - "learning_rate": 8.490191897009915e-07, - "loss": 0.979, - "step": 993 - }, - { - "epoch": 1.6529595015576324, - "grad_norm": 2.258033290502269, - "learning_rate": 8.472873645545474e-07, - "loss": 1.0524, - "step": 994 - }, - { - "epoch": 1.6546209761163033, - "grad_norm": 2.303002200199465, - "learning_rate": 8.45556008236628e-07, - "loss": 0.933, - "step": 995 - }, - { - "epoch": 1.656282450674974, - "grad_norm": 2.2109316644818495, - "learning_rate": 8.438251260625055e-07, - "loss": 0.9229, - "step": 996 - }, - { - "epoch": 1.6579439252336448, - "grad_norm": 2.2713621854438357, - "learning_rate": 8.420947233459962e-07, - "loss": 0.997, - "step": 997 - }, - { - "epoch": 1.6596053997923157, - "grad_norm": 2.1886965102085965, - "learning_rate": 8.403648053994447e-07, - "loss": 0.9646, - "step": 998 - }, - { - "epoch": 1.6612668743509866, - "grad_norm": 2.3286768659520765, - "learning_rate": 8.386353775337078e-07, - "loss": 1.0782, - "step": 999 - }, - { - "epoch": 1.6629283489096574, - "grad_norm": 2.2131507866973443, - "learning_rate": 8.369064450581372e-07, - "loss": 0.9542, - "step": 1000 - }, - { - "epoch": 1.664589823468328, - "grad_norm": 2.2038565852379777, - "learning_rate": 8.351780132805639e-07, - "loss": 1.009, - "step": 1001 - }, - { - "epoch": 1.666251298026999, - "grad_norm": 2.2824651606443798, - "learning_rate": 8.334500875072817e-07, - "loss": 0.9784, - "step": 1002 - }, - { - "epoch": 1.66791277258567, - "grad_norm": 2.34183081355438, - "learning_rate": 8.317226730430309e-07, - "loss": 1.0175, - "step": 1003 - }, - { - "epoch": 1.6695742471443404, - "grad_norm": 2.1288437772823134, - "learning_rate": 8.299957751909826e-07, - "loss": 0.8847, - "step": 1004 - }, - { - "epoch": 1.6712357217030114, - "grad_norm": 2.4568293153056215, - "learning_rate": 8.282693992527212e-07, - "loss": 0.9493, - "step": 1005 - }, - { - "epoch": 1.6728971962616823, - "grad_norm": 2.320949328340056, - "learning_rate": 8.265435505282292e-07, - "loss": 0.9644, - "step": 1006 - }, - { - "epoch": 1.674558670820353, - "grad_norm": 2.607704387361572, - "learning_rate": 8.248182343158705e-07, - "loss": 0.9348, - "step": 1007 - }, - { - "epoch": 1.6762201453790238, - "grad_norm": 2.5655569259629742, - "learning_rate": 8.230934559123739e-07, - "loss": 1.1464, - "step": 1008 - }, - { - "epoch": 1.6778816199376947, - "grad_norm": 2.2568110375917105, - "learning_rate": 8.213692206128178e-07, - "loss": 1.0082, - "step": 1009 - }, - { - "epoch": 1.6795430944963656, - "grad_norm": 2.329387559326929, - "learning_rate": 8.196455337106126e-07, - "loss": 1.0236, - "step": 1010 - }, - { - "epoch": 1.6812045690550363, - "grad_norm": 2.1931737355029477, - "learning_rate": 8.179224004974856e-07, - "loss": 0.9806, - "step": 1011 - }, - { - "epoch": 1.682866043613707, - "grad_norm": 2.7512987063931384, - "learning_rate": 8.161998262634636e-07, - "loss": 1.0763, - "step": 1012 - }, - { - "epoch": 1.684527518172378, - "grad_norm": 2.3447221138047123, - "learning_rate": 8.144778162968583e-07, - "loss": 0.9142, - "step": 1013 - }, - { - "epoch": 1.686188992731049, - "grad_norm": 2.1872618111159152, - "learning_rate": 8.127563758842483e-07, - "loss": 0.9387, - "step": 1014 - }, - { - "epoch": 1.6878504672897197, - "grad_norm": 2.2470446091718, - "learning_rate": 8.11035510310464e-07, - "loss": 0.9224, - "step": 1015 - }, - { - "epoch": 1.6895119418483904, - "grad_norm": 2.5029397601882417, - "learning_rate": 8.093152248585709e-07, - "loss": 0.9544, - "step": 1016 - }, - { - "epoch": 1.6911734164070613, - "grad_norm": 2.1902389788861547, - "learning_rate": 8.075955248098535e-07, - "loss": 0.9536, - "step": 1017 - }, - { - "epoch": 1.6928348909657323, - "grad_norm": 2.0810637251240225, - "learning_rate": 8.058764154437996e-07, - "loss": 1.0527, - "step": 1018 - }, - { - "epoch": 1.6944963655244027, - "grad_norm": 2.3552600183690227, - "learning_rate": 8.041579020380828e-07, - "loss": 0.9047, - "step": 1019 - }, - { - "epoch": 1.6961578400830737, - "grad_norm": 2.434839618311, - "learning_rate": 8.024399898685478e-07, - "loss": 0.9539, - "step": 1020 - }, - { - "epoch": 1.6978193146417446, - "grad_norm": 2.2927568687484237, - "learning_rate": 8.007226842091929e-07, - "loss": 0.9926, - "step": 1021 - }, - { - "epoch": 1.6994807892004153, - "grad_norm": 2.3228074078887775, - "learning_rate": 7.990059903321552e-07, - "loss": 1.0054, - "step": 1022 - }, - { - "epoch": 1.701142263759086, - "grad_norm": 2.110477821804407, - "learning_rate": 7.972899135076928e-07, - "loss": 0.919, - "step": 1023 - }, - { - "epoch": 1.702803738317757, - "grad_norm": 2.153197204937706, - "learning_rate": 7.9557445900417e-07, - "loss": 0.9917, - "step": 1024 - }, - { - "epoch": 1.704465212876428, - "grad_norm": 2.215029615261696, - "learning_rate": 7.938596320880401e-07, - "loss": 0.7933, - "step": 1025 - }, - { - "epoch": 1.7061266874350987, - "grad_norm": 2.3403292049764666, - "learning_rate": 7.9214543802383e-07, - "loss": 0.9624, - "step": 1026 - }, - { - "epoch": 1.7077881619937694, - "grad_norm": 2.1684163037426183, - "learning_rate": 7.904318820741238e-07, - "loss": 0.9629, - "step": 1027 - }, - { - "epoch": 1.7094496365524403, - "grad_norm": 2.263783250114826, - "learning_rate": 7.887189694995464e-07, - "loss": 0.8771, - "step": 1028 - }, - { - "epoch": 1.7111111111111112, - "grad_norm": 2.3185082936393657, - "learning_rate": 7.87006705558748e-07, - "loss": 0.9736, - "step": 1029 - }, - { - "epoch": 1.712772585669782, - "grad_norm": 2.1436040295655907, - "learning_rate": 7.85295095508387e-07, - "loss": 0.9073, - "step": 1030 - }, - { - "epoch": 1.7144340602284527, - "grad_norm": 2.31861818593136, - "learning_rate": 7.835841446031143e-07, - "loss": 0.8523, - "step": 1031 - }, - { - "epoch": 1.7160955347871236, - "grad_norm": 2.2422376300228946, - "learning_rate": 7.818738580955575e-07, - "loss": 0.9703, - "step": 1032 - }, - { - "epoch": 1.7177570093457943, - "grad_norm": 2.2643194712270307, - "learning_rate": 7.801642412363041e-07, - "loss": 1.0062, - "step": 1033 - }, - { - "epoch": 1.719418483904465, - "grad_norm": 2.237623999857119, - "learning_rate": 7.784552992738866e-07, - "loss": 0.8801, - "step": 1034 - }, - { - "epoch": 1.721079958463136, - "grad_norm": 2.282669507043474, - "learning_rate": 7.767470374547646e-07, - "loss": 1.0902, - "step": 1035 - }, - { - "epoch": 1.722741433021807, - "grad_norm": 2.2928400993422704, - "learning_rate": 7.750394610233105e-07, - "loss": 0.8845, - "step": 1036 - }, - { - "epoch": 1.7244029075804777, - "grad_norm": 2.198326988103989, - "learning_rate": 7.733325752217916e-07, - "loss": 0.9141, - "step": 1037 - }, - { - "epoch": 1.7260643821391484, - "grad_norm": 2.2329846089880716, - "learning_rate": 7.716263852903561e-07, - "loss": 0.8736, - "step": 1038 - }, - { - "epoch": 1.7277258566978193, - "grad_norm": 2.1112717332498416, - "learning_rate": 7.699208964670148e-07, - "loss": 0.9301, - "step": 1039 - }, - { - "epoch": 1.7293873312564902, - "grad_norm": 2.3471201925073943, - "learning_rate": 7.68216113987627e-07, - "loss": 1.1426, - "step": 1040 - }, - { - "epoch": 1.731048805815161, - "grad_norm": 2.2076292469918837, - "learning_rate": 7.665120430858828e-07, - "loss": 1.055, - "step": 1041 - }, - { - "epoch": 1.7327102803738317, - "grad_norm": 2.2641331824435107, - "learning_rate": 7.648086889932878e-07, - "loss": 0.9148, - "step": 1042 - }, - { - "epoch": 1.7343717549325026, - "grad_norm": 2.011124865835, - "learning_rate": 7.631060569391481e-07, - "loss": 0.9435, - "step": 1043 - }, - { - "epoch": 1.7360332294911736, - "grad_norm": 2.5541303942545523, - "learning_rate": 7.614041521505517e-07, - "loss": 1.051, - "step": 1044 - }, - { - "epoch": 1.7376947040498443, - "grad_norm": 2.135958705290272, - "learning_rate": 7.597029798523544e-07, - "loss": 0.8675, - "step": 1045 - }, - { - "epoch": 1.739356178608515, - "grad_norm": 2.1710241061601208, - "learning_rate": 7.580025452671635e-07, - "loss": 0.9826, - "step": 1046 - }, - { - "epoch": 1.741017653167186, - "grad_norm": 2.108624585141848, - "learning_rate": 7.563028536153212e-07, - "loss": 0.9617, - "step": 1047 - }, - { - "epoch": 1.7426791277258566, - "grad_norm": 2.106050127640713, - "learning_rate": 7.546039101148895e-07, - "loss": 1.0632, - "step": 1048 - }, - { - "epoch": 1.7443406022845274, - "grad_norm": 2.187342834226081, - "learning_rate": 7.529057199816325e-07, - "loss": 1.0858, - "step": 1049 - }, - { - "epoch": 1.7460020768431983, - "grad_norm": 2.342073364970068, - "learning_rate": 7.512082884290025e-07, - "loss": 0.9126, - "step": 1050 - }, - { - "epoch": 1.7476635514018692, - "grad_norm": 2.3328737377816613, - "learning_rate": 7.495116206681222e-07, - "loss": 0.9681, - "step": 1051 - }, - { - "epoch": 1.74932502596054, - "grad_norm": 2.149584367750816, - "learning_rate": 7.478157219077702e-07, - "loss": 1.0266, - "step": 1052 - }, - { - "epoch": 1.7509865005192107, - "grad_norm": 2.4000332654502867, - "learning_rate": 7.461205973543635e-07, - "loss": 0.9125, - "step": 1053 - }, - { - "epoch": 1.7526479750778816, - "grad_norm": 2.2222345997916446, - "learning_rate": 7.444262522119427e-07, - "loss": 0.9607, - "step": 1054 - }, - { - "epoch": 1.7543094496365526, - "grad_norm": 2.2094198456747254, - "learning_rate": 7.427326916821557e-07, - "loss": 1.0043, - "step": 1055 - }, - { - "epoch": 1.7559709241952233, - "grad_norm": 2.1757968861860615, - "learning_rate": 7.410399209642409e-07, - "loss": 0.8917, - "step": 1056 - }, - { - "epoch": 1.757632398753894, - "grad_norm": 2.493378250255824, - "learning_rate": 7.393479452550132e-07, - "loss": 1.0598, - "step": 1057 - }, - { - "epoch": 1.757632398753894, - "eval_loss": 1.3888019323349, - "eval_runtime": 24.567, - "eval_samples_per_second": 0.448, - "eval_steps_per_second": 0.122, - "step": 1057 - } - ], - "logging_steps": 1, - "max_steps": 1803, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 151, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 4.775024226258125e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}