{ "best_metric": null, "best_model_checkpoint": null, "epoch": 17.77459749552773, "eval_steps": 500, "global_step": 1242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14311270125223613, "grad_norm": 7.146514892578125, "learning_rate": 0.00039677938808373593, "loss": 3.5751, "step": 10 }, { "epoch": 0.28622540250447226, "grad_norm": 2.1325223445892334, "learning_rate": 0.00039355877616747184, "loss": 0.939, "step": 20 }, { "epoch": 0.4293381037567084, "grad_norm": 0.8520782589912415, "learning_rate": 0.00039033816425120774, "loss": 0.2653, "step": 30 }, { "epoch": 0.5724508050089445, "grad_norm": 0.7653748393058777, "learning_rate": 0.00038711755233494365, "loss": 0.1603, "step": 40 }, { "epoch": 0.7155635062611807, "grad_norm": 0.661469578742981, "learning_rate": 0.00038389694041867956, "loss": 0.1886, "step": 50 }, { "epoch": 0.8586762075134168, "grad_norm": 0.39610955119132996, "learning_rate": 0.00038067632850241547, "loss": 0.1859, "step": 60 }, { "epoch": 1.0017889087656529, "grad_norm": 0.4488755464553833, "learning_rate": 0.0003774557165861514, "loss": 0.1538, "step": 70 }, { "epoch": 1.144901610017889, "grad_norm": 0.2944377362728119, "learning_rate": 0.00037423510466988734, "loss": 0.1195, "step": 80 }, { "epoch": 1.2880143112701252, "grad_norm": 0.29124024510383606, "learning_rate": 0.0003710144927536232, "loss": 0.1271, "step": 90 }, { "epoch": 1.4311270125223614, "grad_norm": 0.42328736186027527, "learning_rate": 0.0003677938808373591, "loss": 0.1018, "step": 100 }, { "epoch": 1.5742397137745976, "grad_norm": 0.3259565234184265, "learning_rate": 0.00036457326892109506, "loss": 0.0848, "step": 110 }, { "epoch": 1.7173524150268338, "grad_norm": 0.479124516248703, "learning_rate": 0.0003613526570048309, "loss": 0.106, "step": 120 }, { "epoch": 1.8604651162790697, "grad_norm": 0.40788090229034424, "learning_rate": 0.0003581320450885668, "loss": 0.0969, "step": 130 }, { "epoch": 2.0035778175313057, "grad_norm": 0.3574964106082916, "learning_rate": 0.0003549114331723028, "loss": 0.124, "step": 140 }, { "epoch": 2.146690518783542, "grad_norm": 0.37805065512657166, "learning_rate": 0.0003516908212560387, "loss": 0.0491, "step": 150 }, { "epoch": 2.289803220035778, "grad_norm": 0.25937220454216003, "learning_rate": 0.00034847020933977455, "loss": 0.0669, "step": 160 }, { "epoch": 2.4329159212880143, "grad_norm": 0.34056201577186584, "learning_rate": 0.00034524959742351046, "loss": 0.0595, "step": 170 }, { "epoch": 2.5760286225402504, "grad_norm": 0.30211707949638367, "learning_rate": 0.0003420289855072464, "loss": 0.0648, "step": 180 }, { "epoch": 2.7191413237924866, "grad_norm": 0.18458786606788635, "learning_rate": 0.0003388083735909823, "loss": 0.0545, "step": 190 }, { "epoch": 2.862254025044723, "grad_norm": 0.27384912967681885, "learning_rate": 0.0003355877616747182, "loss": 0.0684, "step": 200 }, { "epoch": 3.005366726296959, "grad_norm": 0.16877304017543793, "learning_rate": 0.00033236714975845414, "loss": 0.0695, "step": 210 }, { "epoch": 3.148479427549195, "grad_norm": 0.07739146798849106, "learning_rate": 0.00032914653784219005, "loss": 0.0256, "step": 220 }, { "epoch": 3.2915921288014314, "grad_norm": 0.2832132577896118, "learning_rate": 0.0003259259259259259, "loss": 0.0263, "step": 230 }, { "epoch": 3.434704830053667, "grad_norm": 0.21412289142608643, "learning_rate": 0.00032270531400966187, "loss": 0.0287, "step": 240 }, { "epoch": 3.5778175313059033, "grad_norm": 0.1840696483850479, "learning_rate": 0.0003194847020933978, "loss": 0.0469, "step": 250 }, { "epoch": 3.7209302325581395, "grad_norm": 0.34246236085891724, "learning_rate": 0.00031626409017713363, "loss": 0.0243, "step": 260 }, { "epoch": 3.8640429338103757, "grad_norm": 0.056173525750637054, "learning_rate": 0.0003130434782608696, "loss": 0.0252, "step": 270 }, { "epoch": 4.007155635062611, "grad_norm": 0.09256428480148315, "learning_rate": 0.0003098228663446055, "loss": 0.0216, "step": 280 }, { "epoch": 4.150268336314848, "grad_norm": 0.20085078477859497, "learning_rate": 0.0003066022544283414, "loss": 0.0102, "step": 290 }, { "epoch": 4.293381037567084, "grad_norm": 0.021982286125421524, "learning_rate": 0.0003033816425120773, "loss": 0.0131, "step": 300 }, { "epoch": 4.43649373881932, "grad_norm": 0.054368916898965836, "learning_rate": 0.0003001610305958132, "loss": 0.0145, "step": 310 }, { "epoch": 4.579606440071556, "grad_norm": 0.0868581086397171, "learning_rate": 0.00029694041867954913, "loss": 0.0181, "step": 320 }, { "epoch": 4.722719141323792, "grad_norm": 0.24308475852012634, "learning_rate": 0.00029371980676328504, "loss": 0.0125, "step": 330 }, { "epoch": 4.8658318425760285, "grad_norm": 0.14394602179527283, "learning_rate": 0.00029049919484702095, "loss": 0.0149, "step": 340 }, { "epoch": 5.008944543828265, "grad_norm": 0.05040862783789635, "learning_rate": 0.00028727858293075686, "loss": 0.0096, "step": 350 }, { "epoch": 5.152057245080501, "grad_norm": 0.28047820925712585, "learning_rate": 0.00028405797101449276, "loss": 0.0032, "step": 360 }, { "epoch": 5.295169946332737, "grad_norm": 0.07502233237028122, "learning_rate": 0.0002808373590982287, "loss": 0.0038, "step": 370 }, { "epoch": 5.438282647584973, "grad_norm": 0.8537871837615967, "learning_rate": 0.0002776167471819646, "loss": 0.0073, "step": 380 }, { "epoch": 5.5813953488372094, "grad_norm": 0.005727715790271759, "learning_rate": 0.0002743961352657005, "loss": 0.0106, "step": 390 }, { "epoch": 5.724508050089446, "grad_norm": 0.04042937234044075, "learning_rate": 0.0002711755233494364, "loss": 0.0041, "step": 400 }, { "epoch": 5.867620751341682, "grad_norm": 0.11248348653316498, "learning_rate": 0.0002679549114331723, "loss": 0.003, "step": 410 }, { "epoch": 6.010733452593918, "grad_norm": 0.014976495876908302, "learning_rate": 0.00026473429951690827, "loss": 0.0067, "step": 420 }, { "epoch": 6.153846153846154, "grad_norm": 0.15070898830890656, "learning_rate": 0.0002615136876006441, "loss": 0.0035, "step": 430 }, { "epoch": 6.29695885509839, "grad_norm": 0.0066925715655088425, "learning_rate": 0.00025829307568438003, "loss": 0.0027, "step": 440 }, { "epoch": 6.440071556350626, "grad_norm": 0.015314973890781403, "learning_rate": 0.00025507246376811594, "loss": 0.0045, "step": 450 }, { "epoch": 6.583184257602863, "grad_norm": 0.030470581725239754, "learning_rate": 0.00025185185185185185, "loss": 0.0029, "step": 460 }, { "epoch": 6.726296958855098, "grad_norm": 0.0092542115598917, "learning_rate": 0.00024863123993558775, "loss": 0.0043, "step": 470 }, { "epoch": 6.869409660107334, "grad_norm": 0.016118695959448814, "learning_rate": 0.00024541062801932366, "loss": 0.0011, "step": 480 }, { "epoch": 7.01252236135957, "grad_norm": 0.01760493591427803, "learning_rate": 0.0002421900161030596, "loss": 0.004, "step": 490 }, { "epoch": 7.155635062611807, "grad_norm": 0.0077838534489274025, "learning_rate": 0.0002389694041867955, "loss": 0.0004, "step": 500 }, { "epoch": 7.298747763864043, "grad_norm": 0.006766254547983408, "learning_rate": 0.00023574879227053139, "loss": 0.0015, "step": 510 }, { "epoch": 7.441860465116279, "grad_norm": 0.009164445102214813, "learning_rate": 0.00023252818035426732, "loss": 0.0006, "step": 520 }, { "epoch": 7.584973166368515, "grad_norm": 0.002783432835713029, "learning_rate": 0.00022930756843800323, "loss": 0.0005, "step": 530 }, { "epoch": 7.728085867620751, "grad_norm": 0.15486985445022583, "learning_rate": 0.0002260869565217391, "loss": 0.0036, "step": 540 }, { "epoch": 7.8711985688729875, "grad_norm": 0.042650897055864334, "learning_rate": 0.00022286634460547507, "loss": 0.0006, "step": 550 }, { "epoch": 8.014311270125223, "grad_norm": 0.0018309111474081874, "learning_rate": 0.00021964573268921095, "loss": 0.0011, "step": 560 }, { "epoch": 8.15742397137746, "grad_norm": 0.005560223013162613, "learning_rate": 0.00021642512077294686, "loss": 0.001, "step": 570 }, { "epoch": 8.300536672629695, "grad_norm": 0.03727242350578308, "learning_rate": 0.0002132045088566828, "loss": 0.0015, "step": 580 }, { "epoch": 8.443649373881932, "grad_norm": 0.34321093559265137, "learning_rate": 0.00020998389694041868, "loss": 0.0031, "step": 590 }, { "epoch": 8.586762075134168, "grad_norm": 0.020942572504281998, "learning_rate": 0.00020676328502415459, "loss": 0.0022, "step": 600 }, { "epoch": 8.729874776386405, "grad_norm": 0.010578208602964878, "learning_rate": 0.00020354267310789052, "loss": 0.0006, "step": 610 }, { "epoch": 8.87298747763864, "grad_norm": 0.004323468543589115, "learning_rate": 0.00020032206119162643, "loss": 0.0005, "step": 620 }, { "epoch": 9.016100178890877, "grad_norm": 0.1614646017551422, "learning_rate": 0.00019710144927536234, "loss": 0.0011, "step": 630 }, { "epoch": 9.159212880143112, "grad_norm": 0.0012326347641646862, "learning_rate": 0.00019388083735909825, "loss": 0.0005, "step": 640 }, { "epoch": 9.30232558139535, "grad_norm": 0.006543063558638096, "learning_rate": 0.00019066022544283415, "loss": 0.0002, "step": 650 }, { "epoch": 9.445438282647585, "grad_norm": 0.21280421316623688, "learning_rate": 0.00018743961352657006, "loss": 0.0003, "step": 660 }, { "epoch": 9.588550983899822, "grad_norm": 0.006327577400952578, "learning_rate": 0.00018421900161030597, "loss": 0.0002, "step": 670 }, { "epoch": 9.731663685152057, "grad_norm": 0.0025285291485488415, "learning_rate": 0.00018099838969404188, "loss": 0.0001, "step": 680 }, { "epoch": 9.874776386404294, "grad_norm": 0.0014309959951788187, "learning_rate": 0.00017777777777777779, "loss": 0.0001, "step": 690 }, { "epoch": 10.01788908765653, "grad_norm": 0.0024150668177753687, "learning_rate": 0.0001745571658615137, "loss": 0.0001, "step": 700 }, { "epoch": 10.161001788908766, "grad_norm": 0.0025761763099581003, "learning_rate": 0.0001713365539452496, "loss": 0.0001, "step": 710 }, { "epoch": 10.304114490161002, "grad_norm": 0.0017020882805809379, "learning_rate": 0.0001681159420289855, "loss": 0.0001, "step": 720 }, { "epoch": 10.447227191413237, "grad_norm": 0.0028596080373972654, "learning_rate": 0.00016489533011272142, "loss": 0.0001, "step": 730 }, { "epoch": 10.590339892665474, "grad_norm": 0.0019378801807761192, "learning_rate": 0.00016167471819645735, "loss": 0.0001, "step": 740 }, { "epoch": 10.73345259391771, "grad_norm": 0.001211544731631875, "learning_rate": 0.00015845410628019323, "loss": 0.0001, "step": 750 }, { "epoch": 10.876565295169947, "grad_norm": 0.0033484594896435738, "learning_rate": 0.00015523349436392914, "loss": 0.0001, "step": 760 }, { "epoch": 11.019677996422182, "grad_norm": 0.001493943389505148, "learning_rate": 0.00015201288244766508, "loss": 0.0001, "step": 770 }, { "epoch": 11.162790697674419, "grad_norm": 0.0019909776747226715, "learning_rate": 0.00014879227053140096, "loss": 0.0001, "step": 780 }, { "epoch": 11.305903398926654, "grad_norm": 0.0011982638388872147, "learning_rate": 0.0001455716586151369, "loss": 0.0001, "step": 790 }, { "epoch": 11.449016100178891, "grad_norm": 0.0015958467265591025, "learning_rate": 0.0001423510466988728, "loss": 0.0001, "step": 800 }, { "epoch": 11.592128801431127, "grad_norm": 0.0008461058023385704, "learning_rate": 0.0001391304347826087, "loss": 0.0001, "step": 810 }, { "epoch": 11.735241502683364, "grad_norm": 0.0005576548865064979, "learning_rate": 0.00013590982286634462, "loss": 0.0001, "step": 820 }, { "epoch": 11.878354203935599, "grad_norm": 0.0017713948618620634, "learning_rate": 0.0001326892109500805, "loss": 0.0001, "step": 830 }, { "epoch": 12.021466905187836, "grad_norm": 0.001206880551762879, "learning_rate": 0.00012946859903381643, "loss": 0.0001, "step": 840 }, { "epoch": 12.164579606440071, "grad_norm": 0.0013083606027066708, "learning_rate": 0.00012624798711755234, "loss": 0.0001, "step": 850 }, { "epoch": 12.307692307692308, "grad_norm": 0.0008201482123695314, "learning_rate": 0.00012302737520128825, "loss": 0.0001, "step": 860 }, { "epoch": 12.450805008944544, "grad_norm": 0.0006802495336160064, "learning_rate": 0.00011980676328502416, "loss": 0.0001, "step": 870 }, { "epoch": 12.59391771019678, "grad_norm": 0.0017911783652380109, "learning_rate": 0.00011658615136876008, "loss": 0.0001, "step": 880 }, { "epoch": 12.737030411449016, "grad_norm": 0.0007388959056697786, "learning_rate": 0.00011336553945249598, "loss": 0.0, "step": 890 }, { "epoch": 12.880143112701251, "grad_norm": 0.0007727427291683853, "learning_rate": 0.00011014492753623188, "loss": 0.0001, "step": 900 }, { "epoch": 13.023255813953488, "grad_norm": 0.0008818788919597864, "learning_rate": 0.0001069243156199678, "loss": 0.0001, "step": 910 }, { "epoch": 13.166368515205724, "grad_norm": 0.0005572364898398519, "learning_rate": 0.0001037037037037037, "loss": 0.0, "step": 920 }, { "epoch": 13.30948121645796, "grad_norm": 0.0009758470696397126, "learning_rate": 0.00010048309178743962, "loss": 0.0001, "step": 930 }, { "epoch": 13.452593917710196, "grad_norm": 0.0003166435344610363, "learning_rate": 9.726247987117553e-05, "loss": 0.0001, "step": 940 }, { "epoch": 13.595706618962433, "grad_norm": 0.0005005749990232289, "learning_rate": 9.404186795491144e-05, "loss": 0.0, "step": 950 }, { "epoch": 13.738819320214668, "grad_norm": 0.0003304154670331627, "learning_rate": 9.082125603864735e-05, "loss": 0.0001, "step": 960 }, { "epoch": 13.881932021466906, "grad_norm": 0.0005377003108151257, "learning_rate": 8.760064412238325e-05, "loss": 0.0001, "step": 970 }, { "epoch": 14.02504472271914, "grad_norm": 0.0015913191018626094, "learning_rate": 8.438003220611916e-05, "loss": 0.0001, "step": 980 }, { "epoch": 14.168157423971378, "grad_norm": 0.000676720985211432, "learning_rate": 8.115942028985508e-05, "loss": 0.0001, "step": 990 }, { "epoch": 14.311270125223613, "grad_norm": 0.0007494900492019951, "learning_rate": 7.793880837359099e-05, "loss": 0.0001, "step": 1000 }, { "epoch": 14.45438282647585, "grad_norm": 0.0015422647120431066, "learning_rate": 7.47181964573269e-05, "loss": 0.0, "step": 1010 }, { "epoch": 14.597495527728086, "grad_norm": 0.0005012313486076891, "learning_rate": 7.14975845410628e-05, "loss": 0.0, "step": 1020 }, { "epoch": 14.740608228980323, "grad_norm": 0.0008338551269844174, "learning_rate": 6.827697262479872e-05, "loss": 0.0, "step": 1030 }, { "epoch": 14.883720930232558, "grad_norm": 0.0006810138584114611, "learning_rate": 6.505636070853462e-05, "loss": 0.0001, "step": 1040 }, { "epoch": 15.026833631484795, "grad_norm": 0.00043299293611198664, "learning_rate": 6.183574879227053e-05, "loss": 0.0, "step": 1050 }, { "epoch": 15.16994633273703, "grad_norm": 0.0005277034360915422, "learning_rate": 5.861513687600645e-05, "loss": 0.0, "step": 1060 }, { "epoch": 15.313059033989267, "grad_norm": 0.0006858156993985176, "learning_rate": 5.5394524959742355e-05, "loss": 0.0001, "step": 1070 }, { "epoch": 15.456171735241503, "grad_norm": 0.0008438636432401836, "learning_rate": 5.217391304347826e-05, "loss": 0.0, "step": 1080 }, { "epoch": 15.59928443649374, "grad_norm": 0.0012173138093203306, "learning_rate": 4.895330112721417e-05, "loss": 0.0001, "step": 1090 }, { "epoch": 15.742397137745975, "grad_norm": 0.002290137577801943, "learning_rate": 4.573268921095008e-05, "loss": 0.0, "step": 1100 }, { "epoch": 15.88550983899821, "grad_norm": 0.0005496228695847094, "learning_rate": 4.2512077294685994e-05, "loss": 0.0, "step": 1110 }, { "epoch": 16.028622540250446, "grad_norm": 0.0018827420426532626, "learning_rate": 3.92914653784219e-05, "loss": 0.0, "step": 1120 }, { "epoch": 16.171735241502684, "grad_norm": 0.00045006562140770257, "learning_rate": 3.607085346215781e-05, "loss": 0.0001, "step": 1130 }, { "epoch": 16.31484794275492, "grad_norm": 0.0005126325413584709, "learning_rate": 3.2850241545893725e-05, "loss": 0.0, "step": 1140 }, { "epoch": 16.457960644007155, "grad_norm": 0.00035093360929749906, "learning_rate": 2.962962962962963e-05, "loss": 0.0, "step": 1150 }, { "epoch": 16.60107334525939, "grad_norm": 0.0010109692811965942, "learning_rate": 2.640901771336554e-05, "loss": 0.0001, "step": 1160 }, { "epoch": 16.74418604651163, "grad_norm": 0.0006910230731591582, "learning_rate": 2.318840579710145e-05, "loss": 0.0, "step": 1170 }, { "epoch": 16.887298747763865, "grad_norm": 0.0004351095121819526, "learning_rate": 1.996779388083736e-05, "loss": 0.0, "step": 1180 }, { "epoch": 17.0304114490161, "grad_norm": 0.0006468660430982709, "learning_rate": 1.674718196457327e-05, "loss": 0.0, "step": 1190 }, { "epoch": 17.173524150268335, "grad_norm": 0.0002576902334112674, "learning_rate": 1.3526570048309179e-05, "loss": 0.0, "step": 1200 }, { "epoch": 17.316636851520574, "grad_norm": 0.0010522498050704598, "learning_rate": 1.0305958132045089e-05, "loss": 0.0, "step": 1210 }, { "epoch": 17.45974955277281, "grad_norm": 0.0007789513911120594, "learning_rate": 7.0853462157809985e-06, "loss": 0.0001, "step": 1220 }, { "epoch": 17.602862254025045, "grad_norm": 0.0009570368565618992, "learning_rate": 3.864734299516908e-06, "loss": 0.0, "step": 1230 }, { "epoch": 17.74597495527728, "grad_norm": 0.0009920781012624502, "learning_rate": 6.44122383252818e-07, "loss": 0.0, "step": 1240 }, { "epoch": 17.77459749552773, "step": 1242, "total_flos": 6.807580263736934e+16, "train_loss": 0.056659956144777014, "train_runtime": 9770.217, "train_samples_per_second": 6.177, "train_steps_per_second": 0.127 } ], "logging_steps": 10, "max_steps": 1242, "num_input_tokens_seen": 0, "num_train_epochs": 18, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.807580263736934e+16, "train_batch_size": 6, "trial_name": null, "trial_params": null }