{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9984552576409578, "eval_steps": 500, "global_step": 1698, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017654198389054396, "grad_norm": 11.075085595262578, "learning_rate": 3.92156862745098e-06, "loss": 1.3624, "step": 10 }, { "epoch": 0.03530839677810879, "grad_norm": 0.8268832619967722, "learning_rate": 7.84313725490196e-06, "loss": 1.0988, "step": 20 }, { "epoch": 0.052962595167163194, "grad_norm": 0.6592198895247919, "learning_rate": 1.1764705882352942e-05, "loss": 1.0279, "step": 30 }, { "epoch": 0.07061679355621758, "grad_norm": 0.7345400249936768, "learning_rate": 1.568627450980392e-05, "loss": 1.0346, "step": 40 }, { "epoch": 0.08827099194527198, "grad_norm": 0.6520880282674657, "learning_rate": 1.9607843137254903e-05, "loss": 1.0161, "step": 50 }, { "epoch": 0.10592519033432639, "grad_norm": 0.6267314186354508, "learning_rate": 1.999852647705027e-05, "loss": 0.9827, "step": 60 }, { "epoch": 0.12357938872338078, "grad_norm": 0.5990775699580297, "learning_rate": 1.9993433374984987e-05, "loss": 0.9727, "step": 70 }, { "epoch": 0.14123358711243517, "grad_norm": 0.7072632521466412, "learning_rate": 1.9984704354748582e-05, "loss": 0.9887, "step": 80 }, { "epoch": 0.15888778550148958, "grad_norm": 0.6302011892410732, "learning_rate": 1.9972342592226873e-05, "loss": 0.9639, "step": 90 }, { "epoch": 0.17654198389054396, "grad_norm": 0.6228020691113849, "learning_rate": 1.9956352585008946e-05, "loss": 0.9584, "step": 100 }, { "epoch": 0.19419618227959837, "grad_norm": 0.6537080367025939, "learning_rate": 1.9936740150750825e-05, "loss": 0.9517, "step": 110 }, { "epoch": 0.21185038066865278, "grad_norm": 0.7111863399924561, "learning_rate": 1.9913512425058803e-05, "loss": 0.9408, "step": 120 }, { "epoch": 0.22950457905770716, "grad_norm": 0.6710168558236342, "learning_rate": 1.9886677858893303e-05, "loss": 0.9374, "step": 130 }, { "epoch": 0.24715877744676157, "grad_norm": 0.6547246997110331, "learning_rate": 1.9856246215494147e-05, "loss": 0.9353, "step": 140 }, { "epoch": 0.264812975835816, "grad_norm": 0.5595831632061861, "learning_rate": 1.982222856682841e-05, "loss": 0.9508, "step": 150 }, { "epoch": 0.28246717422487033, "grad_norm": 0.6617569180053982, "learning_rate": 1.9784637289562067e-05, "loss": 0.9385, "step": 160 }, { "epoch": 0.30012137261392474, "grad_norm": 0.7260939928287287, "learning_rate": 1.9743486060557015e-05, "loss": 0.9226, "step": 170 }, { "epoch": 0.31777557100297915, "grad_norm": 0.6291820154599896, "learning_rate": 1.9698789851894986e-05, "loss": 0.9262, "step": 180 }, { "epoch": 0.33542976939203356, "grad_norm": 0.5974647680183809, "learning_rate": 1.9650564925430257e-05, "loss": 0.9207, "step": 190 }, { "epoch": 0.3530839677810879, "grad_norm": 0.5295497383864004, "learning_rate": 1.9598828826873085e-05, "loss": 0.9259, "step": 200 }, { "epoch": 0.3707381661701423, "grad_norm": 0.5999369621481067, "learning_rate": 1.9543600379406027e-05, "loss": 0.931, "step": 210 }, { "epoch": 0.38839236455919673, "grad_norm": 0.7118598829666988, "learning_rate": 1.9484899676835504e-05, "loss": 0.9132, "step": 220 }, { "epoch": 0.40604656294825114, "grad_norm": 0.5244866233662618, "learning_rate": 1.9422748076281054e-05, "loss": 0.9165, "step": 230 }, { "epoch": 0.42370076133730555, "grad_norm": 0.5981442443509013, "learning_rate": 1.9357168190404937e-05, "loss": 0.9145, "step": 240 }, { "epoch": 0.4413549597263599, "grad_norm": 0.5814732591931184, "learning_rate": 1.9288183879184986e-05, "loss": 0.9108, "step": 250 }, { "epoch": 0.4590091581154143, "grad_norm": 0.6328453186407658, "learning_rate": 1.9215820241233585e-05, "loss": 0.9256, "step": 260 }, { "epoch": 0.4766633565044687, "grad_norm": 0.6253638836302401, "learning_rate": 1.9140103604666035e-05, "loss": 0.903, "step": 270 }, { "epoch": 0.49431755489352314, "grad_norm": 0.6443153938386799, "learning_rate": 1.9061061517521575e-05, "loss": 0.9085, "step": 280 }, { "epoch": 0.5119717532825775, "grad_norm": 0.5794297240032266, "learning_rate": 1.897872273774056e-05, "loss": 0.9142, "step": 290 }, { "epoch": 0.529625951671632, "grad_norm": 0.5506943177514773, "learning_rate": 1.8893117222701435e-05, "loss": 0.8941, "step": 300 }, { "epoch": 0.5472801500606863, "grad_norm": 0.7158873616368372, "learning_rate": 1.8804276118321328e-05, "loss": 0.912, "step": 310 }, { "epoch": 0.5649343484497407, "grad_norm": 0.6761743351092112, "learning_rate": 1.8712231747724194e-05, "loss": 0.909, "step": 320 }, { "epoch": 0.5825885468387951, "grad_norm": 0.5834711748627983, "learning_rate": 1.861701759948068e-05, "loss": 0.8963, "step": 330 }, { "epoch": 0.6002427452278495, "grad_norm": 0.5603155829035662, "learning_rate": 1.8518668315423962e-05, "loss": 0.8995, "step": 340 }, { "epoch": 0.617896943616904, "grad_norm": 0.522797804575778, "learning_rate": 1.8417219678045953e-05, "loss": 0.8962, "step": 350 }, { "epoch": 0.6355511420059583, "grad_norm": 0.6835351293213303, "learning_rate": 1.831270859747857e-05, "loss": 0.8916, "step": 360 }, { "epoch": 0.6532053403950127, "grad_norm": 0.5938476091687838, "learning_rate": 1.8205173098064656e-05, "loss": 0.8726, "step": 370 }, { "epoch": 0.6708595387840671, "grad_norm": 0.634802092406171, "learning_rate": 1.8094652304523584e-05, "loss": 0.8841, "step": 380 }, { "epoch": 0.6885137371731215, "grad_norm": 0.6742253195098137, "learning_rate": 1.7981186427716478e-05, "loss": 0.8817, "step": 390 }, { "epoch": 0.7061679355621758, "grad_norm": 0.5772838179810406, "learning_rate": 1.7864816750016246e-05, "loss": 0.8803, "step": 400 }, { "epoch": 0.7238221339512303, "grad_norm": 0.6030744898202103, "learning_rate": 1.7745585610287812e-05, "loss": 0.876, "step": 410 }, { "epoch": 0.7414763323402846, "grad_norm": 0.5955540196255147, "learning_rate": 1.7623536388483902e-05, "loss": 0.879, "step": 420 }, { "epoch": 0.7591305307293391, "grad_norm": 0.6933702635166431, "learning_rate": 1.7498713489862133e-05, "loss": 0.8917, "step": 430 }, { "epoch": 0.7767847291183935, "grad_norm": 0.5817510021566383, "learning_rate": 1.737116232882895e-05, "loss": 0.8731, "step": 440 }, { "epoch": 0.7944389275074478, "grad_norm": 0.6017117570992859, "learning_rate": 1.7240929312416545e-05, "loss": 0.8758, "step": 450 }, { "epoch": 0.8120931258965023, "grad_norm": 0.52374499191387, "learning_rate": 1.710806182339848e-05, "loss": 0.8728, "step": 460 }, { "epoch": 0.8297473242855566, "grad_norm": 0.5995068247811851, "learning_rate": 1.697260820305044e-05, "loss": 0.8867, "step": 470 }, { "epoch": 0.8474015226746111, "grad_norm": 0.5610337196661641, "learning_rate": 1.683461773356213e-05, "loss": 0.865, "step": 480 }, { "epoch": 0.8650557210636655, "grad_norm": 0.5740324652987965, "learning_rate": 1.669414062010696e-05, "loss": 0.8795, "step": 490 }, { "epoch": 0.8827099194527198, "grad_norm": 0.6580353279100991, "learning_rate": 1.6551227972575823e-05, "loss": 0.8642, "step": 500 }, { "epoch": 0.9003641178417743, "grad_norm": 0.5609839098459638, "learning_rate": 1.6405931786981753e-05, "loss": 0.8666, "step": 510 }, { "epoch": 0.9180183162308286, "grad_norm": 0.606711772111839, "learning_rate": 1.6258304926542183e-05, "loss": 0.85, "step": 520 }, { "epoch": 0.935672514619883, "grad_norm": 0.5282629393908134, "learning_rate": 1.610840110244568e-05, "loss": 0.8703, "step": 530 }, { "epoch": 0.9533267130089375, "grad_norm": 0.5716697646332912, "learning_rate": 1.5956274854310157e-05, "loss": 0.8878, "step": 540 }, { "epoch": 0.9709809113979918, "grad_norm": 0.6293891083073122, "learning_rate": 1.5801981530339695e-05, "loss": 0.8505, "step": 550 }, { "epoch": 0.9886351097870463, "grad_norm": 0.5032178564593078, "learning_rate": 1.5645577267187163e-05, "loss": 0.8625, "step": 560 }, { "epoch": 0.9992276288204789, "eval_loss": 0.7792695760726929, "eval_runtime": 577.071, "eval_samples_per_second": 9.642, "eval_steps_per_second": 2.41, "step": 566 }, { "epoch": 1.0070616793556217, "grad_norm": 0.565136350859829, "learning_rate": 1.5487118969529973e-05, "loss": 0.7727, "step": 570 }, { "epoch": 1.024715877744676, "grad_norm": 0.4954225833969141, "learning_rate": 1.5326664289366406e-05, "loss": 0.7606, "step": 580 }, { "epoch": 1.0423700761337304, "grad_norm": 0.523592237844004, "learning_rate": 1.516427160504006e-05, "loss": 0.7597, "step": 590 }, { "epoch": 1.060024274522785, "grad_norm": 0.5408055640730495, "learning_rate": 1.5000000000000002e-05, "loss": 0.7536, "step": 600 }, { "epoch": 1.0776784729118394, "grad_norm": 0.5246834404536742, "learning_rate": 1.4833909241304391e-05, "loss": 0.7562, "step": 610 }, { "epoch": 1.0953326713008937, "grad_norm": 0.5643207747274573, "learning_rate": 1.4666059757875397e-05, "loss": 0.7612, "step": 620 }, { "epoch": 1.112986869689948, "grad_norm": 0.517010952574902, "learning_rate": 1.4496512618513289e-05, "loss": 0.7575, "step": 630 }, { "epoch": 1.1306410680790024, "grad_norm": 0.5261650605787747, "learning_rate": 1.4325329509677743e-05, "loss": 0.7473, "step": 640 }, { "epoch": 1.148295266468057, "grad_norm": 0.5342217209976181, "learning_rate": 1.4152572713044397e-05, "loss": 0.7645, "step": 650 }, { "epoch": 1.1659494648571114, "grad_norm": 0.5394360986026229, "learning_rate": 1.3978305082844876e-05, "loss": 0.7456, "step": 660 }, { "epoch": 1.1836036632461657, "grad_norm": 0.5251341793704711, "learning_rate": 1.3802590022998483e-05, "loss": 0.7564, "step": 670 }, { "epoch": 1.20125786163522, "grad_norm": 0.5365883139849793, "learning_rate": 1.3625491464043909e-05, "loss": 0.7662, "step": 680 }, { "epoch": 1.2189120600242744, "grad_norm": 0.5851452149462395, "learning_rate": 1.3447073839879339e-05, "loss": 0.7467, "step": 690 }, { "epoch": 1.236566258413329, "grad_norm": 0.49698239669222605, "learning_rate": 1.3267402064319415e-05, "loss": 0.7468, "step": 700 }, { "epoch": 1.2542204568023834, "grad_norm": 0.5490853297859661, "learning_rate": 1.3086541507477598e-05, "loss": 0.7496, "step": 710 }, { "epoch": 1.2718746551914377, "grad_norm": 0.49838225273553083, "learning_rate": 1.2904557971982514e-05, "loss": 0.7465, "step": 720 }, { "epoch": 1.289528853580492, "grad_norm": 0.5406712310299336, "learning_rate": 1.2721517669036929e-05, "loss": 0.7473, "step": 730 }, { "epoch": 1.3071830519695464, "grad_norm": 0.5048219957782835, "learning_rate": 1.253748719432809e-05, "loss": 0.734, "step": 740 }, { "epoch": 1.324837250358601, "grad_norm": 0.47951222482587946, "learning_rate": 1.2352533503798156e-05, "loss": 0.7431, "step": 750 }, { "epoch": 1.3424914487476554, "grad_norm": 0.5851936786168687, "learning_rate": 1.2166723889283574e-05, "loss": 0.7472, "step": 760 }, { "epoch": 1.3601456471367097, "grad_norm": 0.485279585177607, "learning_rate": 1.1980125954032239e-05, "loss": 0.7366, "step": 770 }, { "epoch": 1.377799845525764, "grad_norm": 0.5453687928417636, "learning_rate": 1.1792807588107358e-05, "loss": 0.746, "step": 780 }, { "epoch": 1.3954540439148184, "grad_norm": 0.5866620790608368, "learning_rate": 1.1604836943686957e-05, "loss": 0.7487, "step": 790 }, { "epoch": 1.4131082423038728, "grad_norm": 0.5056556922549863, "learning_rate": 1.141628241026802e-05, "loss": 0.7384, "step": 800 }, { "epoch": 1.4307624406929274, "grad_norm": 0.5141576590110138, "learning_rate": 1.1227212589784297e-05, "loss": 0.7398, "step": 810 }, { "epoch": 1.4484166390819817, "grad_norm": 0.48402830116887263, "learning_rate": 1.1037696271646805e-05, "loss": 0.7369, "step": 820 }, { "epoch": 1.466070837471036, "grad_norm": 0.5165402289756648, "learning_rate": 1.0847802407716128e-05, "loss": 0.7482, "step": 830 }, { "epoch": 1.4837250358600904, "grad_norm": 0.4874182996900902, "learning_rate": 1.0657600087215618e-05, "loss": 0.7375, "step": 840 }, { "epoch": 1.501379234249145, "grad_norm": 0.49738455424585243, "learning_rate": 1.0467158511594595e-05, "loss": 0.7397, "step": 850 }, { "epoch": 1.5190334326381993, "grad_norm": 0.49991861283295536, "learning_rate": 1.0276546969350757e-05, "loss": 0.7278, "step": 860 }, { "epoch": 1.5366876310272537, "grad_norm": 0.5667368371430999, "learning_rate": 1.0085834810820871e-05, "loss": 0.7301, "step": 870 }, { "epoch": 1.554341829416308, "grad_norm": 0.5144068476500945, "learning_rate": 9.89509142294901e-06, "loss": 0.7378, "step": 880 }, { "epoch": 1.5719960278053624, "grad_norm": 0.5137627935948821, "learning_rate": 9.704386204041438e-06, "loss": 0.7517, "step": 890 }, { "epoch": 1.5896502261944168, "grad_norm": 0.52904044022737, "learning_rate": 9.513788538517375e-06, "loss": 0.7407, "step": 900 }, { "epoch": 1.6073044245834711, "grad_norm": 0.5115595110168406, "learning_rate": 9.323367771664819e-06, "loss": 0.7338, "step": 910 }, { "epoch": 1.6249586229725257, "grad_norm": 0.48147125209335884, "learning_rate": 9.133193184410589e-06, "loss": 0.7362, "step": 920 }, { "epoch": 1.64261282136158, "grad_norm": 0.49180548035958394, "learning_rate": 8.943333968113808e-06, "loss": 0.7398, "step": 930 }, { "epoch": 1.6602670197506344, "grad_norm": 0.4867013851446762, "learning_rate": 8.753859199391951e-06, "loss": 0.7483, "step": 940 }, { "epoch": 1.677921218139689, "grad_norm": 0.4804051538514883, "learning_rate": 8.564837814988638e-06, "loss": 0.7307, "step": 950 }, { "epoch": 1.6955754165287433, "grad_norm": 0.4863063091894713, "learning_rate": 8.376338586692367e-06, "loss": 0.7311, "step": 960 }, { "epoch": 1.7132296149177977, "grad_norm": 0.5169386880097873, "learning_rate": 8.188430096315168e-06, "loss": 0.7327, "step": 970 }, { "epoch": 1.730883813306852, "grad_norm": 0.5153036409881078, "learning_rate": 8.00118071074049e-06, "loss": 0.7328, "step": 980 }, { "epoch": 1.7485380116959064, "grad_norm": 0.5006840768446958, "learning_rate": 7.814658557049175e-06, "loss": 0.746, "step": 990 }, { "epoch": 1.7661922100849607, "grad_norm": 0.5193653515770323, "learning_rate": 7.62893149773278e-06, "loss": 0.738, "step": 1000 }, { "epoch": 1.783846408474015, "grad_norm": 0.5162239370053442, "learning_rate": 7.4440671060030725e-06, "loss": 0.73, "step": 1010 }, { "epoch": 1.8015006068630697, "grad_norm": 0.48852121302241314, "learning_rate": 7.260132641206861e-06, "loss": 0.7394, "step": 1020 }, { "epoch": 1.819154805252124, "grad_norm": 0.5463945098209841, "learning_rate": 7.077195024354939e-06, "loss": 0.722, "step": 1030 }, { "epoch": 1.8368090036411784, "grad_norm": 0.49951753268549337, "learning_rate": 6.895320813774206e-06, "loss": 0.7133, "step": 1040 }, { "epoch": 1.854463202030233, "grad_norm": 0.5119500277521447, "learning_rate": 6.714576180891653e-06, "loss": 0.7184, "step": 1050 }, { "epoch": 1.8721174004192873, "grad_norm": 0.6424312472729824, "learning_rate": 6.535026886159221e-06, "loss": 0.7186, "step": 1060 }, { "epoch": 1.8897715988083417, "grad_norm": 0.5373842851342149, "learning_rate": 6.356738255128068e-06, "loss": 0.7211, "step": 1070 }, { "epoch": 1.907425797197396, "grad_norm": 0.4982902687166508, "learning_rate": 6.179775154681184e-06, "loss": 0.7161, "step": 1080 }, { "epoch": 1.9250799955864504, "grad_norm": 0.5414062516129651, "learning_rate": 6.004201969432771e-06, "loss": 0.7303, "step": 1090 }, { "epoch": 1.9427341939755047, "grad_norm": 0.462895367066655, "learning_rate": 5.830082578303193e-06, "loss": 0.7249, "step": 1100 }, { "epoch": 1.960388392364559, "grad_norm": 0.49592685426373223, "learning_rate": 5.6574803312778196e-06, "loss": 0.7253, "step": 1110 }, { "epoch": 1.9780425907536134, "grad_norm": 0.4758478775588506, "learning_rate": 5.486458026358381e-06, "loss": 0.7126, "step": 1120 }, { "epoch": 1.995696789142668, "grad_norm": 0.4969466321267014, "learning_rate": 5.317077886715105e-06, "loss": 0.724, "step": 1130 }, { "epoch": 1.9992276288204789, "eval_loss": 0.6677735447883606, "eval_runtime": 527.1683, "eval_samples_per_second": 10.555, "eval_steps_per_second": 2.639, "step": 1132 }, { "epoch": 2.0133509875317226, "grad_norm": 0.5185737619412808, "learning_rate": 5.14940153804804e-06, "loss": 0.6582, "step": 1140 }, { "epoch": 2.031005185920777, "grad_norm": 0.447637858983293, "learning_rate": 4.983489986165708e-06, "loss": 0.6365, "step": 1150 }, { "epoch": 2.0486593843098313, "grad_norm": 0.5152236160209434, "learning_rate": 4.819403594789335e-06, "loss": 0.6351, "step": 1160 }, { "epoch": 2.0663135826988857, "grad_norm": 0.6273558161173919, "learning_rate": 4.6572020635906535e-06, "loss": 0.6414, "step": 1170 }, { "epoch": 2.08396778108794, "grad_norm": 0.4926770781993292, "learning_rate": 4.4969444064713506e-06, "loss": 0.6335, "step": 1180 }, { "epoch": 2.1016219794769944, "grad_norm": 0.48793300929617467, "learning_rate": 4.338688930091982e-06, "loss": 0.6397, "step": 1190 }, { "epoch": 2.1192761778660487, "grad_norm": 0.48663482440046746, "learning_rate": 4.182493212658224e-06, "loss": 0.652, "step": 1200 }, { "epoch": 2.136930376255103, "grad_norm": 0.4760935997684315, "learning_rate": 4.028414082972141e-06, "loss": 0.6416, "step": 1210 }, { "epoch": 2.1545845746441574, "grad_norm": 0.4582415888721039, "learning_rate": 3.876507599756136e-06, "loss": 0.6295, "step": 1220 }, { "epoch": 2.172238773033212, "grad_norm": 0.4464518146441672, "learning_rate": 3.7268290312570622e-06, "loss": 0.6518, "step": 1230 }, { "epoch": 2.1898929714222666, "grad_norm": 0.4796659798369155, "learning_rate": 3.579432835137928e-06, "loss": 0.6376, "step": 1240 }, { "epoch": 2.207547169811321, "grad_norm": 0.4613709158191507, "learning_rate": 3.434372638664526e-06, "loss": 0.6418, "step": 1250 }, { "epoch": 2.2252013682003753, "grad_norm": 0.4620560070335567, "learning_rate": 3.2917012191941955e-06, "loss": 0.6331, "step": 1260 }, { "epoch": 2.2428555665894296, "grad_norm": 0.4403307799157334, "learning_rate": 3.151470484973792e-06, "loss": 0.6448, "step": 1270 }, { "epoch": 2.260509764978484, "grad_norm": 0.46979962256612806, "learning_rate": 3.0137314562538742e-06, "loss": 0.6333, "step": 1280 }, { "epoch": 2.2781639633675383, "grad_norm": 0.47605403770220545, "learning_rate": 2.8785342467259568e-06, "loss": 0.6372, "step": 1290 }, { "epoch": 2.2958181617565927, "grad_norm": 0.44577983389984577, "learning_rate": 2.745928045289631e-06, "loss": 0.6464, "step": 1300 }, { "epoch": 2.313472360145647, "grad_norm": 0.44706989287780297, "learning_rate": 2.6159610981561134e-06, "loss": 0.626, "step": 1310 }, { "epoch": 2.3311265585347014, "grad_norm": 0.44473280016435734, "learning_rate": 2.4886806912948034e-06, "loss": 0.6362, "step": 1320 }, { "epoch": 2.3487807569237558, "grad_norm": 0.4674495930967707, "learning_rate": 2.3641331332291793e-06, "loss": 0.6453, "step": 1330 }, { "epoch": 2.36643495531281, "grad_norm": 0.4455415304277737, "learning_rate": 2.2423637381883533e-06, "loss": 0.6289, "step": 1340 }, { "epoch": 2.384089153701865, "grad_norm": 0.4544071638805097, "learning_rate": 2.123416809620351e-06, "loss": 0.6331, "step": 1350 }, { "epoch": 2.4017433520909193, "grad_norm": 0.4723689800288618, "learning_rate": 2.007335624073157e-06, "loss": 0.645, "step": 1360 }, { "epoch": 2.4193975504799736, "grad_norm": 0.45220258933192226, "learning_rate": 1.8941624154493731e-06, "loss": 0.6451, "step": 1370 }, { "epoch": 2.437051748869028, "grad_norm": 0.4387363718825142, "learning_rate": 1.7839383596402382e-06, "loss": 0.6339, "step": 1380 }, { "epoch": 2.4547059472580823, "grad_norm": 0.52315736570108, "learning_rate": 1.6767035595445614e-06, "loss": 0.6289, "step": 1390 }, { "epoch": 2.4723601456471367, "grad_norm": 0.44143122222775955, "learning_rate": 1.5724970304780662e-06, "loss": 0.6307, "step": 1400 }, { "epoch": 2.490014344036191, "grad_norm": 0.49564787712258485, "learning_rate": 1.4713566859784045e-06, "loss": 0.6426, "step": 1410 }, { "epoch": 2.5076685424252454, "grad_norm": 0.44402047508915227, "learning_rate": 1.373319324011061e-06, "loss": 0.6368, "step": 1420 }, { "epoch": 2.5253227408142997, "grad_norm": 0.44130611579940193, "learning_rate": 1.2784206135811184e-06, "loss": 0.6398, "step": 1430 }, { "epoch": 2.5429769392033545, "grad_norm": 0.4764351411669087, "learning_rate": 1.1866950817557743e-06, "loss": 0.6349, "step": 1440 }, { "epoch": 2.5606311375924085, "grad_norm": 0.4398014393212803, "learning_rate": 1.0981761011023317e-06, "loss": 0.6218, "step": 1450 }, { "epoch": 2.5782853359814633, "grad_norm": 0.4470462981839341, "learning_rate": 1.0128958775462393e-06, "loss": 0.6241, "step": 1460 }, { "epoch": 2.5959395343705176, "grad_norm": 0.43673398026523563, "learning_rate": 9.308854386535849e-07, "loss": 0.6408, "step": 1470 }, { "epoch": 2.613593732759572, "grad_norm": 0.4505573168779551, "learning_rate": 8.521746223423088e-07, "loss": 0.6283, "step": 1480 }, { "epoch": 2.6312479311486263, "grad_norm": 0.4440648392714403, "learning_rate": 7.767920660262529e-07, "loss": 0.6285, "step": 1490 }, { "epoch": 2.6489021295376807, "grad_norm": 0.4414194215856468, "learning_rate": 7.047651961959978e-07, "loss": 0.6244, "step": 1500 }, { "epoch": 2.666556327926735, "grad_norm": 0.4440446143190118, "learning_rate": 6.361202184402515e-07, "loss": 0.6332, "step": 1510 }, { "epoch": 2.6842105263157894, "grad_norm": 0.4706364746939079, "learning_rate": 5.708821079114612e-07, "loss": 0.6362, "step": 1520 }, { "epoch": 2.7018647247048437, "grad_norm": 0.43340031061371953, "learning_rate": 5.090746002390734e-07, "loss": 0.6349, "step": 1530 }, { "epoch": 2.719518923093898, "grad_norm": 0.4246335933712338, "learning_rate": 4.507201828937935e-07, "loss": 0.6383, "step": 1540 }, { "epoch": 2.737173121482953, "grad_norm": 0.44029888263426625, "learning_rate": 3.958400870059476e-07, "loss": 0.6353, "step": 1550 }, { "epoch": 2.754827319872007, "grad_norm": 0.4317011108249921, "learning_rate": 3.444542796409478e-07, "loss": 0.632, "step": 1560 }, { "epoch": 2.7724815182610616, "grad_norm": 0.43100429391620887, "learning_rate": 2.965814565346548e-07, "loss": 0.6382, "step": 1570 }, { "epoch": 2.790135716650116, "grad_norm": 0.4769391216396139, "learning_rate": 2.522390352912985e-07, "loss": 0.6281, "step": 1580 }, { "epoch": 2.8077899150391703, "grad_norm": 0.44400518164711406, "learning_rate": 2.1144314904642194e-07, "loss": 0.6312, "step": 1590 }, { "epoch": 2.8254441134282247, "grad_norm": 0.4518349510084993, "learning_rate": 1.7420864059714215e-07, "loss": 0.6319, "step": 1600 }, { "epoch": 2.843098311817279, "grad_norm": 0.4353149820484635, "learning_rate": 1.405490570018908e-07, "loss": 0.6352, "step": 1610 }, { "epoch": 2.8607525102063334, "grad_norm": 0.4249015575790441, "learning_rate": 1.1047664465157592e-07, "loss": 0.6471, "step": 1620 }, { "epoch": 2.8784067085953877, "grad_norm": 0.42688299650964856, "learning_rate": 8.400234481397041e-08, "loss": 0.6232, "step": 1630 }, { "epoch": 2.896060906984442, "grad_norm": 0.4550228428201464, "learning_rate": 6.113578965293854e-08, "loss": 0.6426, "step": 1640 }, { "epoch": 2.9137151053734964, "grad_norm": 0.4424051249216907, "learning_rate": 4.188529872396374e-08, "loss": 0.6162, "step": 1650 }, { "epoch": 2.9313693037625512, "grad_norm": 0.43719698480253905, "learning_rate": 2.625787594723428e-08, "loss": 0.6353, "step": 1660 }, { "epoch": 2.9490235021516056, "grad_norm": 0.4552786221630045, "learning_rate": 1.4259207059403868e-08, "loss": 0.6355, "step": 1670 }, { "epoch": 2.96667770054066, "grad_norm": 0.4300462034554104, "learning_rate": 5.8936575449475284e-09, "loss": 0.6366, "step": 1680 }, { "epoch": 2.9843318989297143, "grad_norm": 0.4321948802854605, "learning_rate": 1.1642710478598772e-09, "loss": 0.6383, "step": 1690 }, { "epoch": 2.9984552576409578, "step": 1698, "total_flos": 2844841381724160.0, "train_loss": 0.0, "train_runtime": 8.0551, "train_samples_per_second": 27002.356, "train_steps_per_second": 210.799 } ], "logging_steps": 10, "max_steps": 1698, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2844841381724160.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }