{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9806451612903224, "eval_steps": 500, "global_step": 231, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012903225806451613, "grad_norm": 0.882150089808769, "learning_rate": 8.333333333333334e-06, "loss": 1.3191, "step": 1 }, { "epoch": 0.025806451612903226, "grad_norm": 0.8369153094823952, "learning_rate": 1.6666666666666667e-05, "loss": 1.249, "step": 2 }, { "epoch": 0.03870967741935484, "grad_norm": 0.8525103918091212, "learning_rate": 2.5e-05, "loss": 1.2775, "step": 3 }, { "epoch": 0.05161290322580645, "grad_norm": 0.8113130093304075, "learning_rate": 3.3333333333333335e-05, "loss": 1.2577, "step": 4 }, { "epoch": 0.06451612903225806, "grad_norm": 0.7691226782403744, "learning_rate": 4.166666666666667e-05, "loss": 1.2275, "step": 5 }, { "epoch": 0.07741935483870968, "grad_norm": 0.5954210054804412, "learning_rate": 5e-05, "loss": 1.1159, "step": 6 }, { "epoch": 0.09032258064516129, "grad_norm": 0.48189256930049384, "learning_rate": 5.833333333333334e-05, "loss": 1.0593, "step": 7 }, { "epoch": 0.1032258064516129, "grad_norm": 0.5241879927945232, "learning_rate": 6.666666666666667e-05, "loss": 1.0031, "step": 8 }, { "epoch": 0.11612903225806452, "grad_norm": 0.5751865259411146, "learning_rate": 7.500000000000001e-05, "loss": 0.9263, "step": 9 }, { "epoch": 0.12903225806451613, "grad_norm": 0.5686526755807603, "learning_rate": 8.333333333333334e-05, "loss": 0.8146, "step": 10 }, { "epoch": 0.14193548387096774, "grad_norm": 0.5156906474251192, "learning_rate": 9.166666666666667e-05, "loss": 0.7583, "step": 11 }, { "epoch": 0.15483870967741936, "grad_norm": 0.4901634328534619, "learning_rate": 0.0001, "loss": 0.6686, "step": 12 }, { "epoch": 0.16774193548387098, "grad_norm": 0.376084270046461, "learning_rate": 0.00010833333333333333, "loss": 0.6005, "step": 13 }, { "epoch": 0.18064516129032257, "grad_norm": 0.2761318809240614, "learning_rate": 0.00011666666666666668, "loss": 0.5741, "step": 14 }, { "epoch": 0.1935483870967742, "grad_norm": 0.25038763704461725, "learning_rate": 0.000125, "loss": 0.5465, "step": 15 }, { "epoch": 0.2064516129032258, "grad_norm": 0.2214903977106201, "learning_rate": 0.00013333333333333334, "loss": 0.5138, "step": 16 }, { "epoch": 0.21935483870967742, "grad_norm": 0.28905541505099525, "learning_rate": 0.00014166666666666668, "loss": 0.5247, "step": 17 }, { "epoch": 0.23225806451612904, "grad_norm": 0.20699066633757193, "learning_rate": 0.00015000000000000001, "loss": 0.4978, "step": 18 }, { "epoch": 0.24516129032258063, "grad_norm": 0.219457528851344, "learning_rate": 0.00015833333333333332, "loss": 0.4924, "step": 19 }, { "epoch": 0.25806451612903225, "grad_norm": 0.16596853789220767, "learning_rate": 0.0001666666666666667, "loss": 0.4759, "step": 20 }, { "epoch": 0.2709677419354839, "grad_norm": 0.13228412371333673, "learning_rate": 0.000175, "loss": 0.4613, "step": 21 }, { "epoch": 0.2838709677419355, "grad_norm": 0.1421107856190867, "learning_rate": 0.00018333333333333334, "loss": 0.4852, "step": 22 }, { "epoch": 0.2967741935483871, "grad_norm": 0.12552928984887968, "learning_rate": 0.00019166666666666667, "loss": 0.4786, "step": 23 }, { "epoch": 0.3096774193548387, "grad_norm": 0.11489463060846784, "learning_rate": 0.0002, "loss": 0.4532, "step": 24 }, { "epoch": 0.3225806451612903, "grad_norm": 0.11476879539402507, "learning_rate": 0.00019998848349441062, "loss": 0.4454, "step": 25 }, { "epoch": 0.33548387096774196, "grad_norm": 0.1256602270101812, "learning_rate": 0.00019995393663024054, "loss": 0.4513, "step": 26 }, { "epoch": 0.34838709677419355, "grad_norm": 0.11833482485698336, "learning_rate": 0.00019989636736467278, "loss": 0.44, "step": 27 }, { "epoch": 0.36129032258064514, "grad_norm": 0.11124019681377781, "learning_rate": 0.00019981578895764273, "loss": 0.4439, "step": 28 }, { "epoch": 0.3741935483870968, "grad_norm": 0.10954971384477814, "learning_rate": 0.00019971221996878394, "loss": 0.4274, "step": 29 }, { "epoch": 0.3870967741935484, "grad_norm": 0.11422715129880294, "learning_rate": 0.00019958568425315314, "loss": 0.4254, "step": 30 }, { "epoch": 0.4, "grad_norm": 0.11262310014016527, "learning_rate": 0.00019943621095573586, "loss": 0.4204, "step": 31 }, { "epoch": 0.4129032258064516, "grad_norm": 0.11143099554463408, "learning_rate": 0.00019926383450473344, "loss": 0.4105, "step": 32 }, { "epoch": 0.4258064516129032, "grad_norm": 0.1088260973247734, "learning_rate": 0.00019906859460363307, "loss": 0.4136, "step": 33 }, { "epoch": 0.43870967741935485, "grad_norm": 0.10400753996611788, "learning_rate": 0.00019885053622206304, "loss": 0.4213, "step": 34 }, { "epoch": 0.45161290322580644, "grad_norm": 0.09587900896302251, "learning_rate": 0.0001986097095854347, "loss": 0.4085, "step": 35 }, { "epoch": 0.4645161290322581, "grad_norm": 0.10119603747308556, "learning_rate": 0.0001983461701633742, "loss": 0.4181, "step": 36 }, { "epoch": 0.4774193548387097, "grad_norm": 0.10062413136253176, "learning_rate": 0.00019805997865694614, "loss": 0.4098, "step": 37 }, { "epoch": 0.49032258064516127, "grad_norm": 0.09162394941720846, "learning_rate": 0.0001977512009846721, "loss": 0.4085, "step": 38 }, { "epoch": 0.5032258064516129, "grad_norm": 0.09269316443279575, "learning_rate": 0.00019741990826734794, "loss": 0.3994, "step": 39 }, { "epoch": 0.5161290322580645, "grad_norm": 0.08782581803238095, "learning_rate": 0.00019706617681166218, "loss": 0.3983, "step": 40 }, { "epoch": 0.5290322580645161, "grad_norm": 0.08665646987756218, "learning_rate": 0.00019669008809262062, "loss": 0.3938, "step": 41 }, { "epoch": 0.5419354838709678, "grad_norm": 0.09289388957990503, "learning_rate": 0.00019629172873477995, "loss": 0.396, "step": 42 }, { "epoch": 0.5548387096774193, "grad_norm": 0.09203344649472522, "learning_rate": 0.00019587119049229557, "loss": 0.4052, "step": 43 }, { "epoch": 0.567741935483871, "grad_norm": 0.08209774194723368, "learning_rate": 0.0001954285702277879, "loss": 0.3959, "step": 44 }, { "epoch": 0.5806451612903226, "grad_norm": 0.08595872863630391, "learning_rate": 0.00019496396989003193, "loss": 0.397, "step": 45 }, { "epoch": 0.5935483870967742, "grad_norm": 0.09041908237644536, "learning_rate": 0.00019447749649047542, "loss": 0.3992, "step": 46 }, { "epoch": 0.6064516129032258, "grad_norm": 0.08321976348844515, "learning_rate": 0.00019396926207859084, "loss": 0.4095, "step": 47 }, { "epoch": 0.6193548387096774, "grad_norm": 0.07887604040253807, "learning_rate": 0.00019343938371606712, "loss": 0.3866, "step": 48 }, { "epoch": 0.632258064516129, "grad_norm": 0.08329265943906447, "learning_rate": 0.00019288798344984672, "loss": 0.3985, "step": 49 }, { "epoch": 0.6451612903225806, "grad_norm": 0.08661703211305888, "learning_rate": 0.00019231518828401458, "loss": 0.3925, "step": 50 }, { "epoch": 0.6580645161290323, "grad_norm": 0.08382217550700771, "learning_rate": 0.00019172113015054532, "loss": 0.3862, "step": 51 }, { "epoch": 0.6709677419354839, "grad_norm": 0.08245124856491458, "learning_rate": 0.00019110594587891519, "loss": 0.3847, "step": 52 }, { "epoch": 0.6838709677419355, "grad_norm": 0.08319716279149986, "learning_rate": 0.00019046977716458626, "loss": 0.3775, "step": 53 }, { "epoch": 0.6967741935483871, "grad_norm": 0.08074648144423298, "learning_rate": 0.0001898127705363696, "loss": 0.3786, "step": 54 }, { "epoch": 0.7096774193548387, "grad_norm": 0.08472762376284584, "learning_rate": 0.0001891350773226754, "loss": 0.3923, "step": 55 }, { "epoch": 0.7225806451612903, "grad_norm": 0.08398076059437376, "learning_rate": 0.00018843685361665723, "loss": 0.3709, "step": 56 }, { "epoch": 0.7354838709677419, "grad_norm": 0.08465216102770419, "learning_rate": 0.00018771826024025946, "loss": 0.3818, "step": 57 }, { "epoch": 0.7483870967741936, "grad_norm": 0.09145572810056589, "learning_rate": 0.00018697946270717467, "loss": 0.39, "step": 58 }, { "epoch": 0.7612903225806451, "grad_norm": 0.08415188367023674, "learning_rate": 0.00018622063118472134, "loss": 0.3733, "step": 59 }, { "epoch": 0.7741935483870968, "grad_norm": 0.08576290382509591, "learning_rate": 0.00018544194045464886, "loss": 0.3878, "step": 60 }, { "epoch": 0.7870967741935484, "grad_norm": 0.0844142047859298, "learning_rate": 0.00018464356987288013, "loss": 0.3637, "step": 61 }, { "epoch": 0.8, "grad_norm": 0.08918487261557899, "learning_rate": 0.00018382570332820043, "loss": 0.3775, "step": 62 }, { "epoch": 0.8129032258064516, "grad_norm": 0.0795181880669878, "learning_rate": 0.00018298852919990252, "loss": 0.3853, "step": 63 }, { "epoch": 0.8258064516129032, "grad_norm": 0.08173055996583302, "learning_rate": 0.0001821322403143969, "loss": 0.38, "step": 64 }, { "epoch": 0.8387096774193549, "grad_norm": 0.08525070031165603, "learning_rate": 0.0001812570339007983, "loss": 0.3778, "step": 65 }, { "epoch": 0.8516129032258064, "grad_norm": 0.08531235204546653, "learning_rate": 0.00018036311154549784, "loss": 0.3727, "step": 66 }, { "epoch": 0.864516129032258, "grad_norm": 0.08169851479895494, "learning_rate": 0.00017945067914573146, "loss": 0.365, "step": 67 }, { "epoch": 0.8774193548387097, "grad_norm": 0.08463789046916101, "learning_rate": 0.0001785199468621559, "loss": 0.3752, "step": 68 }, { "epoch": 0.8903225806451613, "grad_norm": 0.09441843624235378, "learning_rate": 0.000177571129070442, "loss": 0.3665, "step": 69 }, { "epoch": 0.9032258064516129, "grad_norm": 0.08530939476149231, "learning_rate": 0.0001766044443118978, "loss": 0.3926, "step": 70 }, { "epoch": 0.9161290322580645, "grad_norm": 0.0836606457284625, "learning_rate": 0.00017562011524313185, "loss": 0.3844, "step": 71 }, { "epoch": 0.9290322580645162, "grad_norm": 0.09868625782773943, "learning_rate": 0.00017461836858476856, "loss": 0.3835, "step": 72 }, { "epoch": 0.9419354838709677, "grad_norm": 0.082132336261239, "learning_rate": 0.00017359943506922774, "loss": 0.3792, "step": 73 }, { "epoch": 0.9548387096774194, "grad_norm": 0.08948965393301354, "learning_rate": 0.0001725635493875799, "loss": 0.3813, "step": 74 }, { "epoch": 0.967741935483871, "grad_norm": 0.08539410389371488, "learning_rate": 0.00017151095013548994, "loss": 0.3774, "step": 75 }, { "epoch": 0.9806451612903225, "grad_norm": 0.08690404790165682, "learning_rate": 0.00017044187975826124, "loss": 0.3762, "step": 76 }, { "epoch": 0.9935483870967742, "grad_norm": 0.09039522496805455, "learning_rate": 0.0001693565844949933, "loss": 0.3733, "step": 77 }, { "epoch": 0.9935483870967742, "eval_loss": 0.3743511736392975, "eval_runtime": 42.1339, "eval_samples_per_second": 24.66, "eval_steps_per_second": 0.783, "step": 77 }, { "epoch": 1.0064516129032257, "grad_norm": 0.09165665911792642, "learning_rate": 0.00016825531432186543, "loss": 0.3532, "step": 78 }, { "epoch": 1.0193548387096774, "grad_norm": 0.0801922544260219, "learning_rate": 0.0001671383228945597, "loss": 0.347, "step": 79 }, { "epoch": 1.032258064516129, "grad_norm": 0.08352186065175837, "learning_rate": 0.00016600586748983641, "loss": 0.3566, "step": 80 }, { "epoch": 1.0451612903225806, "grad_norm": 0.08793176795367076, "learning_rate": 0.0001648582089462756, "loss": 0.3473, "step": 81 }, { "epoch": 1.0580645161290323, "grad_norm": 0.08913951531063671, "learning_rate": 0.00016369561160419784, "loss": 0.342, "step": 82 }, { "epoch": 1.070967741935484, "grad_norm": 0.08309712335786672, "learning_rate": 0.0001625183432447789, "loss": 0.345, "step": 83 }, { "epoch": 1.0838709677419356, "grad_norm": 0.08725330804483407, "learning_rate": 0.00016132667502837165, "loss": 0.3523, "step": 84 }, { "epoch": 1.096774193548387, "grad_norm": 0.08680862762413778, "learning_rate": 0.00016012088143204953, "loss": 0.3554, "step": 85 }, { "epoch": 1.1096774193548387, "grad_norm": 0.0863782848559528, "learning_rate": 0.00015890124018638638, "loss": 0.364, "step": 86 }, { "epoch": 1.1225806451612903, "grad_norm": 0.08388848992116194, "learning_rate": 0.00015766803221148673, "loss": 0.3568, "step": 87 }, { "epoch": 1.135483870967742, "grad_norm": 0.08226994751114965, "learning_rate": 0.00015642154155228122, "loss": 0.3489, "step": 88 }, { "epoch": 1.1483870967741936, "grad_norm": 0.08575965994905438, "learning_rate": 0.00015516205531310273, "loss": 0.3466, "step": 89 }, { "epoch": 1.1612903225806452, "grad_norm": 0.0895747440427046, "learning_rate": 0.00015388986359155758, "loss": 0.3488, "step": 90 }, { "epoch": 1.1741935483870969, "grad_norm": 0.08403222320010312, "learning_rate": 0.00015260525941170712, "loss": 0.356, "step": 91 }, { "epoch": 1.1870967741935483, "grad_norm": 0.08627434364043794, "learning_rate": 0.0001513085386565758, "loss": 0.3519, "step": 92 }, { "epoch": 1.2, "grad_norm": 0.08925414655300028, "learning_rate": 0.00015000000000000001, "loss": 0.3523, "step": 93 }, { "epoch": 1.2129032258064516, "grad_norm": 0.09120079741968923, "learning_rate": 0.00014867994483783485, "loss": 0.3555, "step": 94 }, { "epoch": 1.2258064516129032, "grad_norm": 0.08519037826685563, "learning_rate": 0.0001473486772185334, "loss": 0.3551, "step": 95 }, { "epoch": 1.238709677419355, "grad_norm": 0.08814591743170447, "learning_rate": 0.00014600650377311522, "loss": 0.3535, "step": 96 }, { "epoch": 1.2516129032258063, "grad_norm": 0.08812877093082108, "learning_rate": 0.00014465373364454001, "loss": 0.3498, "step": 97 }, { "epoch": 1.2645161290322582, "grad_norm": 0.08596197743921638, "learning_rate": 0.00014329067841650274, "loss": 0.3484, "step": 98 }, { "epoch": 1.2774193548387096, "grad_norm": 0.09025513346881896, "learning_rate": 0.00014191765204166643, "loss": 0.3465, "step": 99 }, { "epoch": 1.2903225806451613, "grad_norm": 0.08665409616008209, "learning_rate": 0.00014053497076934948, "loss": 0.35, "step": 100 }, { "epoch": 1.303225806451613, "grad_norm": 0.09012608398761074, "learning_rate": 0.00013914295307268396, "loss": 0.3516, "step": 101 }, { "epoch": 1.3161290322580645, "grad_norm": 0.09456407877563842, "learning_rate": 0.00013774191957526143, "loss": 0.3639, "step": 102 }, { "epoch": 1.3290322580645162, "grad_norm": 0.0888376260234129, "learning_rate": 0.00013633219297728416, "loss": 0.3396, "step": 103 }, { "epoch": 1.3419354838709676, "grad_norm": 0.08652600639054038, "learning_rate": 0.00013491409798123687, "loss": 0.3445, "step": 104 }, { "epoch": 1.3548387096774195, "grad_norm": 0.09269194410505097, "learning_rate": 0.00013348796121709862, "loss": 0.3555, "step": 105 }, { "epoch": 1.367741935483871, "grad_norm": 0.09421096011594207, "learning_rate": 0.00013205411116710972, "loss": 0.3508, "step": 106 }, { "epoch": 1.3806451612903226, "grad_norm": 0.09286783444235318, "learning_rate": 0.00013061287809011242, "loss": 0.3571, "step": 107 }, { "epoch": 1.3935483870967742, "grad_norm": 0.08172852976047028, "learning_rate": 0.0001291645939454825, "loss": 0.3488, "step": 108 }, { "epoch": 1.4064516129032258, "grad_norm": 0.09033973727962885, "learning_rate": 0.0001277095923166689, "loss": 0.3498, "step": 109 }, { "epoch": 1.4193548387096775, "grad_norm": 0.09628933362833343, "learning_rate": 0.00012624820833435937, "loss": 0.3472, "step": 110 }, { "epoch": 1.432258064516129, "grad_norm": 0.08471497514674803, "learning_rate": 0.00012478077859929, "loss": 0.3353, "step": 111 }, { "epoch": 1.4451612903225808, "grad_norm": 0.08976133324522119, "learning_rate": 0.00012330764110471566, "loss": 0.3468, "step": 112 }, { "epoch": 1.4580645161290322, "grad_norm": 0.09634877556737409, "learning_rate": 0.00012182913515856015, "loss": 0.3541, "step": 113 }, { "epoch": 1.4709677419354839, "grad_norm": 0.09348923296138459, "learning_rate": 0.0001203456013052634, "loss": 0.3521, "step": 114 }, { "epoch": 1.4838709677419355, "grad_norm": 0.09437711091684706, "learning_rate": 0.00011885738124734358, "loss": 0.3566, "step": 115 }, { "epoch": 1.4967741935483871, "grad_norm": 0.08916702937111011, "learning_rate": 0.00011736481776669306, "loss": 0.3458, "step": 116 }, { "epoch": 1.5096774193548388, "grad_norm": 0.09100601467580355, "learning_rate": 0.00011586825464562514, "loss": 0.3593, "step": 117 }, { "epoch": 1.5225806451612902, "grad_norm": 0.08990470683690902, "learning_rate": 0.00011436803658769082, "loss": 0.3434, "step": 118 }, { "epoch": 1.535483870967742, "grad_norm": 0.0932653393737011, "learning_rate": 0.00011286450913828312, "loss": 0.342, "step": 119 }, { "epoch": 1.5483870967741935, "grad_norm": 0.08960531773257623, "learning_rate": 0.00011135801860504749, "loss": 0.3628, "step": 120 }, { "epoch": 1.5612903225806452, "grad_norm": 0.09275069273094473, "learning_rate": 0.00010984891197811687, "loss": 0.3513, "step": 121 }, { "epoch": 1.5741935483870968, "grad_norm": 0.09527469311088294, "learning_rate": 0.00010833753685018935, "loss": 0.3556, "step": 122 }, { "epoch": 1.5870967741935482, "grad_norm": 0.09323849659154124, "learning_rate": 0.0001068242413364671, "loss": 0.3448, "step": 123 }, { "epoch": 1.6, "grad_norm": 0.08474554028292876, "learning_rate": 0.00010530937399447496, "loss": 0.3499, "step": 124 }, { "epoch": 1.6129032258064515, "grad_norm": 0.09382059811382143, "learning_rate": 0.00010379328374377715, "loss": 0.3384, "step": 125 }, { "epoch": 1.6258064516129034, "grad_norm": 0.09276702527842776, "learning_rate": 0.00010227631978561056, "loss": 0.3444, "step": 126 }, { "epoch": 1.6387096774193548, "grad_norm": 0.08750152088472078, "learning_rate": 0.00010075883152245334, "loss": 0.3569, "step": 127 }, { "epoch": 1.6516129032258065, "grad_norm": 0.08714445180642569, "learning_rate": 9.92411684775467e-05, "loss": 0.342, "step": 128 }, { "epoch": 1.664516129032258, "grad_norm": 0.08469902272466831, "learning_rate": 9.772368021438943e-05, "loss": 0.3342, "step": 129 }, { "epoch": 1.6774193548387095, "grad_norm": 0.08724585745005611, "learning_rate": 9.620671625622288e-05, "loss": 0.3335, "step": 130 }, { "epoch": 1.6903225806451614, "grad_norm": 0.09087336723016343, "learning_rate": 9.469062600552509e-05, "loss": 0.3447, "step": 131 }, { "epoch": 1.7032258064516128, "grad_norm": 0.08863278083042062, "learning_rate": 9.317575866353292e-05, "loss": 0.3487, "step": 132 }, { "epoch": 1.7161290322580647, "grad_norm": 0.08343459715762, "learning_rate": 9.166246314981066e-05, "loss": 0.3454, "step": 133 }, { "epoch": 1.729032258064516, "grad_norm": 0.08837483796029806, "learning_rate": 9.015108802188313e-05, "loss": 0.3484, "step": 134 }, { "epoch": 1.7419354838709677, "grad_norm": 0.08762249376974672, "learning_rate": 8.86419813949525e-05, "loss": 0.3447, "step": 135 }, { "epoch": 1.7548387096774194, "grad_norm": 0.08446853010895118, "learning_rate": 8.713549086171691e-05, "loss": 0.3466, "step": 136 }, { "epoch": 1.7677419354838708, "grad_norm": 0.08897676787603495, "learning_rate": 8.563196341230919e-05, "loss": 0.3434, "step": 137 }, { "epoch": 1.7806451612903227, "grad_norm": 0.09210810174866911, "learning_rate": 8.413174535437487e-05, "loss": 0.355, "step": 138 }, { "epoch": 1.793548387096774, "grad_norm": 0.0877098792555575, "learning_rate": 8.263518223330697e-05, "loss": 0.3392, "step": 139 }, { "epoch": 1.8064516129032258, "grad_norm": 0.09059259587839792, "learning_rate": 8.114261875265643e-05, "loss": 0.3465, "step": 140 }, { "epoch": 1.8193548387096774, "grad_norm": 0.09043152099082513, "learning_rate": 7.965439869473664e-05, "loss": 0.3409, "step": 141 }, { "epoch": 1.832258064516129, "grad_norm": 0.08863483273837267, "learning_rate": 7.817086484143986e-05, "loss": 0.3497, "step": 142 }, { "epoch": 1.8451612903225807, "grad_norm": 0.08351509862847174, "learning_rate": 7.669235889528436e-05, "loss": 0.3484, "step": 143 }, { "epoch": 1.8580645161290321, "grad_norm": 0.08881689002413959, "learning_rate": 7.521922140071002e-05, "loss": 0.3428, "step": 144 }, { "epoch": 1.870967741935484, "grad_norm": 0.08962413300366581, "learning_rate": 7.375179166564063e-05, "loss": 0.3353, "step": 145 }, { "epoch": 1.8838709677419354, "grad_norm": 0.08991947191225944, "learning_rate": 7.229040768333115e-05, "loss": 0.3366, "step": 146 }, { "epoch": 1.896774193548387, "grad_norm": 0.0890545628104281, "learning_rate": 7.08354060545175e-05, "loss": 0.3381, "step": 147 }, { "epoch": 1.9096774193548387, "grad_norm": 0.09306016588414409, "learning_rate": 6.93871219098876e-05, "loss": 0.3356, "step": 148 }, { "epoch": 1.9225806451612903, "grad_norm": 0.08816048934545212, "learning_rate": 6.79458888328903e-05, "loss": 0.3412, "step": 149 }, { "epoch": 1.935483870967742, "grad_norm": 0.09006593042575502, "learning_rate": 6.651203878290139e-05, "loss": 0.3471, "step": 150 }, { "epoch": 1.9483870967741934, "grad_norm": 0.08499237638300171, "learning_rate": 6.508590201876317e-05, "loss": 0.335, "step": 151 }, { "epoch": 1.9612903225806453, "grad_norm": 0.09566747308379261, "learning_rate": 6.366780702271589e-05, "loss": 0.3395, "step": 152 }, { "epoch": 1.9741935483870967, "grad_norm": 0.0915253754596643, "learning_rate": 6.225808042473858e-05, "loss": 0.3488, "step": 153 }, { "epoch": 1.9870967741935484, "grad_norm": 0.08657357278603872, "learning_rate": 6.085704692731609e-05, "loss": 0.3344, "step": 154 }, { "epoch": 2.0, "grad_norm": 0.08950726731743963, "learning_rate": 5.9465029230650534e-05, "loss": 0.33, "step": 155 }, { "epoch": 2.0, "eval_loss": 0.35439133644104004, "eval_runtime": 36.1469, "eval_samples_per_second": 28.744, "eval_steps_per_second": 0.913, "step": 155 }, { "epoch": 2.0129032258064514, "grad_norm": 0.08961232668946545, "learning_rate": 5.8082347958333625e-05, "loss": 0.3273, "step": 156 }, { "epoch": 2.0258064516129033, "grad_norm": 0.09402916213349197, "learning_rate": 5.670932158349731e-05, "loss": 0.3218, "step": 157 }, { "epoch": 2.0387096774193547, "grad_norm": 0.08520247695821515, "learning_rate": 5.5346266355459995e-05, "loss": 0.3089, "step": 158 }, { "epoch": 2.0516129032258066, "grad_norm": 0.08637288183919145, "learning_rate": 5.399349622688479e-05, "loss": 0.3266, "step": 159 }, { "epoch": 2.064516129032258, "grad_norm": 0.08823864345930746, "learning_rate": 5.26513227814666e-05, "loss": 0.329, "step": 160 }, { "epoch": 2.07741935483871, "grad_norm": 0.09384371931382793, "learning_rate": 5.1320055162165115e-05, "loss": 0.3275, "step": 161 }, { "epoch": 2.0903225806451613, "grad_norm": 0.09516405744887674, "learning_rate": 5.000000000000002e-05, "loss": 0.332, "step": 162 }, { "epoch": 2.1032258064516127, "grad_norm": 0.08966279182804247, "learning_rate": 4.869146134342426e-05, "loss": 0.3247, "step": 163 }, { "epoch": 2.1161290322580646, "grad_norm": 0.08700940402163973, "learning_rate": 4.739474058829289e-05, "loss": 0.3221, "step": 164 }, { "epoch": 2.129032258064516, "grad_norm": 0.08984677102800173, "learning_rate": 4.611013640844245e-05, "loss": 0.3272, "step": 165 }, { "epoch": 2.141935483870968, "grad_norm": 0.08964202186304891, "learning_rate": 4.483794468689728e-05, "loss": 0.3188, "step": 166 }, { "epoch": 2.1548387096774193, "grad_norm": 0.09997697429798251, "learning_rate": 4.357845844771881e-05, "loss": 0.3383, "step": 167 }, { "epoch": 2.167741935483871, "grad_norm": 0.09510073376177604, "learning_rate": 4.2331967788513295e-05, "loss": 0.3252, "step": 168 }, { "epoch": 2.1806451612903226, "grad_norm": 0.09107612709336496, "learning_rate": 4.109875981361363e-05, "loss": 0.3217, "step": 169 }, { "epoch": 2.193548387096774, "grad_norm": 0.08804927379783276, "learning_rate": 3.987911856795047e-05, "loss": 0.3173, "step": 170 }, { "epoch": 2.206451612903226, "grad_norm": 0.0916081059987062, "learning_rate": 3.8673324971628357e-05, "loss": 0.3285, "step": 171 }, { "epoch": 2.2193548387096773, "grad_norm": 0.09226628432750343, "learning_rate": 3.7481656755221125e-05, "loss": 0.3154, "step": 172 }, { "epoch": 2.232258064516129, "grad_norm": 0.09145015878266409, "learning_rate": 3.630438839580217e-05, "loss": 0.3087, "step": 173 }, { "epoch": 2.2451612903225806, "grad_norm": 0.08786201399591659, "learning_rate": 3.5141791053724405e-05, "loss": 0.3151, "step": 174 }, { "epoch": 2.258064516129032, "grad_norm": 0.09259402512083086, "learning_rate": 3.399413251016359e-05, "loss": 0.3369, "step": 175 }, { "epoch": 2.270967741935484, "grad_norm": 0.09311260751337232, "learning_rate": 3.2861677105440336e-05, "loss": 0.3051, "step": 176 }, { "epoch": 2.2838709677419353, "grad_norm": 0.09217712904693832, "learning_rate": 3.174468567813461e-05, "loss": 0.3199, "step": 177 }, { "epoch": 2.296774193548387, "grad_norm": 0.09141877592974519, "learning_rate": 3.0643415505006735e-05, "loss": 0.3229, "step": 178 }, { "epoch": 2.3096774193548386, "grad_norm": 0.09528833689903496, "learning_rate": 2.9558120241738784e-05, "loss": 0.3286, "step": 179 }, { "epoch": 2.3225806451612905, "grad_norm": 0.09070636787107308, "learning_rate": 2.8489049864510054e-05, "loss": 0.3348, "step": 180 }, { "epoch": 2.335483870967742, "grad_norm": 0.09307512327341362, "learning_rate": 2.7436450612420095e-05, "loss": 0.3256, "step": 181 }, { "epoch": 2.3483870967741938, "grad_norm": 0.09127823479306682, "learning_rate": 2.640056493077231e-05, "loss": 0.3181, "step": 182 }, { "epoch": 2.361290322580645, "grad_norm": 0.09246009256113925, "learning_rate": 2.5381631415231454e-05, "loss": 0.3391, "step": 183 }, { "epoch": 2.3741935483870966, "grad_norm": 0.09095352379758655, "learning_rate": 2.4379884756868167e-05, "loss": 0.3172, "step": 184 }, { "epoch": 2.3870967741935485, "grad_norm": 0.0926880163626768, "learning_rate": 2.339555568810221e-05, "loss": 0.3177, "step": 185 }, { "epoch": 2.4, "grad_norm": 0.09094474131194094, "learning_rate": 2.242887092955801e-05, "loss": 0.3199, "step": 186 }, { "epoch": 2.412903225806452, "grad_norm": 0.09106546035353981, "learning_rate": 2.1480053137844115e-05, "loss": 0.3222, "step": 187 }, { "epoch": 2.425806451612903, "grad_norm": 0.08873018715134598, "learning_rate": 2.054932085426856e-05, "loss": 0.3118, "step": 188 }, { "epoch": 2.4387096774193546, "grad_norm": 0.0932765377498955, "learning_rate": 1.9636888454502178e-05, "loss": 0.3358, "step": 189 }, { "epoch": 2.4516129032258065, "grad_norm": 0.09181586534157822, "learning_rate": 1.8742966099201697e-05, "loss": 0.3157, "step": 190 }, { "epoch": 2.464516129032258, "grad_norm": 0.0929486436457203, "learning_rate": 1.7867759685603114e-05, "loss": 0.3154, "step": 191 }, { "epoch": 2.47741935483871, "grad_norm": 0.09188630220285351, "learning_rate": 1.7011470800097496e-05, "loss": 0.3181, "step": 192 }, { "epoch": 2.490322580645161, "grad_norm": 0.09574286894431329, "learning_rate": 1.6174296671799572e-05, "loss": 0.3222, "step": 193 }, { "epoch": 2.5032258064516126, "grad_norm": 0.09145354457132104, "learning_rate": 1.5356430127119913e-05, "loss": 0.3222, "step": 194 }, { "epoch": 2.5161290322580645, "grad_norm": 0.09039580690260736, "learning_rate": 1.4558059545351143e-05, "loss": 0.324, "step": 195 }, { "epoch": 2.5290322580645164, "grad_norm": 0.08979381831653434, "learning_rate": 1.3779368815278647e-05, "loss": 0.3107, "step": 196 }, { "epoch": 2.541935483870968, "grad_norm": 0.09526292697431937, "learning_rate": 1.302053729282533e-05, "loss": 0.3219, "step": 197 }, { "epoch": 2.554838709677419, "grad_norm": 0.09310358146453943, "learning_rate": 1.2281739759740574e-05, "loss": 0.3214, "step": 198 }, { "epoch": 2.567741935483871, "grad_norm": 0.09212645063531479, "learning_rate": 1.1563146383342772e-05, "loss": 0.3154, "step": 199 }, { "epoch": 2.5806451612903225, "grad_norm": 0.09533681862557382, "learning_rate": 1.0864922677324618e-05, "loss": 0.319, "step": 200 }, { "epoch": 2.5935483870967744, "grad_norm": 0.09551418366783314, "learning_rate": 1.01872294636304e-05, "loss": 0.3333, "step": 201 }, { "epoch": 2.606451612903226, "grad_norm": 0.08930212325894361, "learning_rate": 9.530222835413738e-06, "loss": 0.3048, "step": 202 }, { "epoch": 2.6193548387096772, "grad_norm": 0.09220378121771236, "learning_rate": 8.894054121084838e-06, "loss": 0.3146, "step": 203 }, { "epoch": 2.632258064516129, "grad_norm": 0.09150774720724307, "learning_rate": 8.278869849454718e-06, "loss": 0.3311, "step": 204 }, { "epoch": 2.6451612903225805, "grad_norm": 0.09261513270619316, "learning_rate": 7.684811715985429e-06, "loss": 0.3172, "step": 205 }, { "epoch": 2.6580645161290324, "grad_norm": 0.0941004102909483, "learning_rate": 7.1120165501533e-06, "loss": 0.3347, "step": 206 }, { "epoch": 2.670967741935484, "grad_norm": 0.08707518610128166, "learning_rate": 6.560616283932897e-06, "loss": 0.3116, "step": 207 }, { "epoch": 2.6838709677419352, "grad_norm": 0.08648707636296159, "learning_rate": 6.030737921409169e-06, "loss": 0.3144, "step": 208 }, { "epoch": 2.696774193548387, "grad_norm": 0.09169150101119816, "learning_rate": 5.52250350952459e-06, "loss": 0.3255, "step": 209 }, { "epoch": 2.709677419354839, "grad_norm": 0.09060072523264334, "learning_rate": 5.036030109968082e-06, "loss": 0.3183, "step": 210 }, { "epoch": 2.7225806451612904, "grad_norm": 0.09077216490604942, "learning_rate": 4.5714297722121106e-06, "loss": 0.321, "step": 211 }, { "epoch": 2.735483870967742, "grad_norm": 0.09088968433443333, "learning_rate": 4.128809507704445e-06, "loss": 0.3172, "step": 212 }, { "epoch": 2.7483870967741937, "grad_norm": 0.09191902683388614, "learning_rate": 3.7082712652200867e-06, "loss": 0.3261, "step": 213 }, { "epoch": 2.761290322580645, "grad_norm": 0.08843215800144302, "learning_rate": 3.3099119073793928e-06, "loss": 0.3158, "step": 214 }, { "epoch": 2.774193548387097, "grad_norm": 0.09079938334868655, "learning_rate": 2.9338231883378366e-06, "loss": 0.3178, "step": 215 }, { "epoch": 2.7870967741935484, "grad_norm": 0.09122789808454786, "learning_rate": 2.580091732652101e-06, "loss": 0.3282, "step": 216 }, { "epoch": 2.8, "grad_norm": 0.09380292374109117, "learning_rate": 2.248799015327907e-06, "loss": 0.3359, "step": 217 }, { "epoch": 2.8129032258064517, "grad_norm": 0.09035917420929797, "learning_rate": 1.9400213430538773e-06, "loss": 0.3169, "step": 218 }, { "epoch": 2.825806451612903, "grad_norm": 0.09195121657817087, "learning_rate": 1.6538298366257976e-06, "loss": 0.3314, "step": 219 }, { "epoch": 2.838709677419355, "grad_norm": 0.09166102367139951, "learning_rate": 1.3902904145653096e-06, "loss": 0.3258, "step": 220 }, { "epoch": 2.8516129032258064, "grad_norm": 0.0921992572010057, "learning_rate": 1.1494637779369766e-06, "loss": 0.3298, "step": 221 }, { "epoch": 2.864516129032258, "grad_norm": 0.09068261067988724, "learning_rate": 9.314053963669245e-07, "loss": 0.3214, "step": 222 }, { "epoch": 2.8774193548387097, "grad_norm": 0.09417924199778298, "learning_rate": 7.361654952665609e-07, "loss": 0.3134, "step": 223 }, { "epoch": 2.8903225806451616, "grad_norm": 0.0901765977296441, "learning_rate": 5.637890442641402e-07, "loss": 0.3221, "step": 224 }, { "epoch": 2.903225806451613, "grad_norm": 0.09094506589085496, "learning_rate": 4.143157468468717e-07, "loss": 0.3128, "step": 225 }, { "epoch": 2.9161290322580644, "grad_norm": 0.08772549933058231, "learning_rate": 2.877800312160783e-07, "loss": 0.3248, "step": 226 }, { "epoch": 2.9290322580645163, "grad_norm": 0.09191883931659987, "learning_rate": 1.8421104235727405e-07, "loss": 0.3114, "step": 227 }, { "epoch": 2.9419354838709677, "grad_norm": 0.08876137430429, "learning_rate": 1.0363263532724432e-07, "loss": 0.3127, "step": 228 }, { "epoch": 2.9548387096774196, "grad_norm": 0.09157045134043748, "learning_rate": 4.606336975948589e-08, "loss": 0.3275, "step": 229 }, { "epoch": 2.967741935483871, "grad_norm": 0.08940213355520302, "learning_rate": 1.1516505589381776e-08, "loss": 0.3246, "step": 230 }, { "epoch": 2.9806451612903224, "grad_norm": 0.0895898052255747, "learning_rate": 0.0, "loss": 0.3079, "step": 231 }, { "epoch": 2.9806451612903224, "eval_loss": 0.3507891595363617, "eval_runtime": 36.0777, "eval_samples_per_second": 28.799, "eval_steps_per_second": 0.915, "step": 231 }, { "epoch": 2.9806451612903224, "step": 231, "total_flos": 9.324729662937498e+16, "train_loss": 0.3951803825118325, "train_runtime": 2997.4381, "train_samples_per_second": 9.871, "train_steps_per_second": 0.077 } ], "logging_steps": 1, "max_steps": 231, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.324729662937498e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }