{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 91, "global_step": 362, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0027624309392265192, "grad_norm": 0.42503952980041504, "learning_rate": 2.0000000000000003e-06, "loss": 1.7314, "step": 1 }, { "epoch": 0.0027624309392265192, "eval_loss": 1.3109512329101562, "eval_runtime": 138.9562, "eval_samples_per_second": 10.241, "eval_steps_per_second": 0.324, "step": 1 }, { "epoch": 0.0055248618784530384, "grad_norm": 0.4957458972930908, "learning_rate": 4.000000000000001e-06, "loss": 1.8536, "step": 2 }, { "epoch": 0.008287292817679558, "grad_norm": 0.4254479706287384, "learning_rate": 6e-06, "loss": 1.7545, "step": 3 }, { "epoch": 0.011049723756906077, "grad_norm": 0.4761411249637604, "learning_rate": 8.000000000000001e-06, "loss": 1.7814, "step": 4 }, { "epoch": 0.013812154696132596, "grad_norm": 0.47434934973716736, "learning_rate": 1e-05, "loss": 1.7703, "step": 5 }, { "epoch": 0.016574585635359115, "grad_norm": 0.4699034094810486, "learning_rate": 1.2e-05, "loss": 1.83, "step": 6 }, { "epoch": 0.019337016574585635, "grad_norm": 0.4335905909538269, "learning_rate": 1.4e-05, "loss": 1.6648, "step": 7 }, { "epoch": 0.022099447513812154, "grad_norm": 0.4264945685863495, "learning_rate": 1.6000000000000003e-05, "loss": 1.7164, "step": 8 }, { "epoch": 0.024861878453038673, "grad_norm": 0.417251318693161, "learning_rate": 1.8e-05, "loss": 1.7058, "step": 9 }, { "epoch": 0.027624309392265192, "grad_norm": 0.42205414175987244, "learning_rate": 2e-05, "loss": 1.71, "step": 10 }, { "epoch": 0.03038674033149171, "grad_norm": 0.414096862077713, "learning_rate": 1.9999601726381415e-05, "loss": 1.6954, "step": 11 }, { "epoch": 0.03314917127071823, "grad_norm": 0.412890762090683, "learning_rate": 1.9998406937250035e-05, "loss": 1.6941, "step": 12 }, { "epoch": 0.03591160220994475, "grad_norm": 0.4428125321865082, "learning_rate": 1.9996415727776456e-05, "loss": 1.6439, "step": 13 }, { "epoch": 0.03867403314917127, "grad_norm": 0.4378657341003418, "learning_rate": 1.999362825656992e-05, "loss": 1.7518, "step": 14 }, { "epoch": 0.04143646408839779, "grad_norm": 0.4670448303222656, "learning_rate": 1.9990044745665672e-05, "loss": 1.6461, "step": 15 }, { "epoch": 0.04419889502762431, "grad_norm": 0.44707468152046204, "learning_rate": 1.998566548050729e-05, "loss": 1.7201, "step": 16 }, { "epoch": 0.04696132596685083, "grad_norm": 0.4317484200000763, "learning_rate": 1.9980490809923928e-05, "loss": 1.7697, "step": 17 }, { "epoch": 0.049723756906077346, "grad_norm": 0.390918105840683, "learning_rate": 1.9974521146102535e-05, "loss": 1.6336, "step": 18 }, { "epoch": 0.052486187845303865, "grad_norm": 0.44283199310302734, "learning_rate": 1.9967756964555044e-05, "loss": 1.7058, "step": 19 }, { "epoch": 0.055248618784530384, "grad_norm": 0.40721943974494934, "learning_rate": 1.9960198804080462e-05, "loss": 1.7619, "step": 20 }, { "epoch": 0.058011049723756904, "grad_norm": 0.3747328817844391, "learning_rate": 1.995184726672197e-05, "loss": 1.6998, "step": 21 }, { "epoch": 0.06077348066298342, "grad_norm": 0.3749975860118866, "learning_rate": 1.9942703017718977e-05, "loss": 1.7488, "step": 22 }, { "epoch": 0.06353591160220995, "grad_norm": 0.36129963397979736, "learning_rate": 1.99327667854541e-05, "loss": 1.6649, "step": 23 }, { "epoch": 0.06629834254143646, "grad_norm": 0.3862815797328949, "learning_rate": 1.9922039361395186e-05, "loss": 1.626, "step": 24 }, { "epoch": 0.06906077348066299, "grad_norm": 0.4051111042499542, "learning_rate": 1.991052160003223e-05, "loss": 1.6523, "step": 25 }, { "epoch": 0.0718232044198895, "grad_norm": 0.42131727933883667, "learning_rate": 1.989821441880933e-05, "loss": 1.8082, "step": 26 }, { "epoch": 0.07458563535911603, "grad_norm": 0.39152222871780396, "learning_rate": 1.9885118798051607e-05, "loss": 1.6831, "step": 27 }, { "epoch": 0.07734806629834254, "grad_norm": 0.40526384115219116, "learning_rate": 1.9871235780887114e-05, "loss": 1.6613, "step": 28 }, { "epoch": 0.08011049723756906, "grad_norm": 0.3861314654350281, "learning_rate": 1.9856566473163747e-05, "loss": 1.6981, "step": 29 }, { "epoch": 0.08287292817679558, "grad_norm": 0.38509440422058105, "learning_rate": 1.984111204336116e-05, "loss": 1.6309, "step": 30 }, { "epoch": 0.0856353591160221, "grad_norm": 0.3785051107406616, "learning_rate": 1.9824873722497694e-05, "loss": 1.675, "step": 31 }, { "epoch": 0.08839779005524862, "grad_norm": 0.4013555645942688, "learning_rate": 1.9807852804032306e-05, "loss": 1.6033, "step": 32 }, { "epoch": 0.09116022099447514, "grad_norm": 0.36987578868865967, "learning_rate": 1.9790050643761552e-05, "loss": 1.643, "step": 33 }, { "epoch": 0.09392265193370165, "grad_norm": 0.34048470854759216, "learning_rate": 1.9771468659711595e-05, "loss": 1.7037, "step": 34 }, { "epoch": 0.09668508287292818, "grad_norm": 0.370815247297287, "learning_rate": 1.975210833202524e-05, "loss": 1.7189, "step": 35 }, { "epoch": 0.09944751381215469, "grad_norm": 0.3450065553188324, "learning_rate": 1.9731971202844036e-05, "loss": 1.6371, "step": 36 }, { "epoch": 0.10220994475138122, "grad_norm": 0.3642140328884125, "learning_rate": 1.9711058876185446e-05, "loss": 1.6962, "step": 37 }, { "epoch": 0.10497237569060773, "grad_norm": 0.3298833668231964, "learning_rate": 1.9689373017815076e-05, "loss": 1.5134, "step": 38 }, { "epoch": 0.10773480662983426, "grad_norm": 0.35403338074684143, "learning_rate": 1.9666915355113976e-05, "loss": 1.7167, "step": 39 }, { "epoch": 0.11049723756906077, "grad_norm": 0.36911338567733765, "learning_rate": 1.964368767694107e-05, "loss": 1.6674, "step": 40 }, { "epoch": 0.1132596685082873, "grad_norm": 0.38089630007743835, "learning_rate": 1.9619691833490645e-05, "loss": 1.5777, "step": 41 }, { "epoch": 0.11602209944751381, "grad_norm": 0.3290576934814453, "learning_rate": 1.9594929736144978e-05, "loss": 1.6851, "step": 42 }, { "epoch": 0.11878453038674033, "grad_norm": 0.3029409348964691, "learning_rate": 1.956940335732209e-05, "loss": 1.2519, "step": 43 }, { "epoch": 0.12154696132596685, "grad_norm": 0.3706592321395874, "learning_rate": 1.954311473031864e-05, "loss": 1.6061, "step": 44 }, { "epoch": 0.12430939226519337, "grad_norm": 0.36404719948768616, "learning_rate": 1.9516065949147945e-05, "loss": 1.5515, "step": 45 }, { "epoch": 0.1270718232044199, "grad_norm": 0.33177605271339417, "learning_rate": 1.9488259168373198e-05, "loss": 1.5692, "step": 46 }, { "epoch": 0.1298342541436464, "grad_norm": 0.3582829535007477, "learning_rate": 1.9459696602935838e-05, "loss": 1.5336, "step": 47 }, { "epoch": 0.13259668508287292, "grad_norm": 0.3565369248390198, "learning_rate": 1.9430380527979124e-05, "loss": 1.6251, "step": 48 }, { "epoch": 0.13535911602209943, "grad_norm": 0.3471137285232544, "learning_rate": 1.94003132786669e-05, "loss": 1.6245, "step": 49 }, { "epoch": 0.13812154696132597, "grad_norm": 0.4011210501194, "learning_rate": 1.936949724999762e-05, "loss": 1.3687, "step": 50 }, { "epoch": 0.1408839779005525, "grad_norm": 0.36058053374290466, "learning_rate": 1.9337934896613516e-05, "loss": 1.5283, "step": 51 }, { "epoch": 0.143646408839779, "grad_norm": 0.39054054021835327, "learning_rate": 1.930562873260514e-05, "loss": 1.5784, "step": 52 }, { "epoch": 0.1464088397790055, "grad_norm": 0.39804011583328247, "learning_rate": 1.927258133131105e-05, "loss": 1.605, "step": 53 }, { "epoch": 0.14917127071823205, "grad_norm": 0.42572247982025146, "learning_rate": 1.9238795325112867e-05, "loss": 1.4554, "step": 54 }, { "epoch": 0.15193370165745856, "grad_norm": 0.4022100269794464, "learning_rate": 1.9204273405225588e-05, "loss": 1.5905, "step": 55 }, { "epoch": 0.15469613259668508, "grad_norm": 0.43215224146842957, "learning_rate": 1.9169018321483198e-05, "loss": 1.6233, "step": 56 }, { "epoch": 0.1574585635359116, "grad_norm": 0.38549479842185974, "learning_rate": 1.9133032882119656e-05, "loss": 1.5868, "step": 57 }, { "epoch": 0.16022099447513813, "grad_norm": 0.42155200242996216, "learning_rate": 1.9096319953545186e-05, "loss": 1.4519, "step": 58 }, { "epoch": 0.16298342541436464, "grad_norm": 0.4292687177658081, "learning_rate": 1.9058882460117972e-05, "loss": 1.6053, "step": 59 }, { "epoch": 0.16574585635359115, "grad_norm": 0.44173118472099304, "learning_rate": 1.9020723383911214e-05, "loss": 1.6328, "step": 60 }, { "epoch": 0.1685082872928177, "grad_norm": 0.4508700966835022, "learning_rate": 1.8981845764475585e-05, "loss": 1.6, "step": 61 }, { "epoch": 0.1712707182320442, "grad_norm": 0.42624732851982117, "learning_rate": 1.8942252698597113e-05, "loss": 1.6086, "step": 62 }, { "epoch": 0.17403314917127072, "grad_norm": 0.4029814600944519, "learning_rate": 1.890194734005053e-05, "loss": 1.5955, "step": 63 }, { "epoch": 0.17679558011049723, "grad_norm": 0.4393555223941803, "learning_rate": 1.8860932899348028e-05, "loss": 1.5735, "step": 64 }, { "epoch": 0.17955801104972377, "grad_norm": 0.40672072768211365, "learning_rate": 1.881921264348355e-05, "loss": 1.5975, "step": 65 }, { "epoch": 0.18232044198895028, "grad_norm": 0.4309908151626587, "learning_rate": 1.8776789895672557e-05, "loss": 1.5539, "step": 66 }, { "epoch": 0.1850828729281768, "grad_norm": 0.41728851199150085, "learning_rate": 1.8733668035087302e-05, "loss": 1.5678, "step": 67 }, { "epoch": 0.1878453038674033, "grad_norm": 0.46532243490219116, "learning_rate": 1.8689850496587674e-05, "loss": 1.5277, "step": 68 }, { "epoch": 0.19060773480662985, "grad_norm": 0.43453872203826904, "learning_rate": 1.8645340770447595e-05, "loss": 1.5192, "step": 69 }, { "epoch": 0.19337016574585636, "grad_norm": 0.46961575746536255, "learning_rate": 1.8600142402077006e-05, "loss": 1.4962, "step": 70 }, { "epoch": 0.19613259668508287, "grad_norm": 0.47958558797836304, "learning_rate": 1.8554258991739454e-05, "loss": 1.502, "step": 71 }, { "epoch": 0.19889502762430938, "grad_norm": 0.4123697280883789, "learning_rate": 1.850769419426531e-05, "loss": 1.4997, "step": 72 }, { "epoch": 0.20165745856353592, "grad_norm": 0.5050204396247864, "learning_rate": 1.8460451718760653e-05, "loss": 1.5283, "step": 73 }, { "epoch": 0.20441988950276244, "grad_norm": 0.44690632820129395, "learning_rate": 1.8412535328311813e-05, "loss": 1.5429, "step": 74 }, { "epoch": 0.20718232044198895, "grad_norm": 0.45188507437705994, "learning_rate": 1.8363948839685638e-05, "loss": 1.532, "step": 75 }, { "epoch": 0.20994475138121546, "grad_norm": 0.4514811038970947, "learning_rate": 1.8314696123025456e-05, "loss": 1.4147, "step": 76 }, { "epoch": 0.212707182320442, "grad_norm": 0.5114113688468933, "learning_rate": 1.8264781101542797e-05, "loss": 1.5552, "step": 77 }, { "epoch": 0.2154696132596685, "grad_norm": 0.51650071144104, "learning_rate": 1.8214207751204917e-05, "loss": 1.5554, "step": 78 }, { "epoch": 0.21823204419889503, "grad_norm": 0.48588883876800537, "learning_rate": 1.816298010041806e-05, "loss": 1.5688, "step": 79 }, { "epoch": 0.22099447513812154, "grad_norm": 0.4869782328605652, "learning_rate": 1.8111102229706593e-05, "loss": 1.6062, "step": 80 }, { "epoch": 0.22375690607734808, "grad_norm": 0.4736042320728302, "learning_rate": 1.805857827138798e-05, "loss": 1.4834, "step": 81 }, { "epoch": 0.2265193370165746, "grad_norm": 0.5085079073905945, "learning_rate": 1.8005412409243604e-05, "loss": 1.4549, "step": 82 }, { "epoch": 0.2292817679558011, "grad_norm": 0.5268322229385376, "learning_rate": 1.7951608878185533e-05, "loss": 1.4762, "step": 83 }, { "epoch": 0.23204419889502761, "grad_norm": 0.5247999429702759, "learning_rate": 1.789717196391916e-05, "loss": 1.4625, "step": 84 }, { "epoch": 0.23480662983425415, "grad_norm": 0.5090611577033997, "learning_rate": 1.7842106002601854e-05, "loss": 1.4634, "step": 85 }, { "epoch": 0.23756906077348067, "grad_norm": 0.500877857208252, "learning_rate": 1.778641538049755e-05, "loss": 1.5259, "step": 86 }, { "epoch": 0.24033149171270718, "grad_norm": 0.5528935790061951, "learning_rate": 1.773010453362737e-05, "loss": 1.4783, "step": 87 }, { "epoch": 0.2430939226519337, "grad_norm": 0.4904051125049591, "learning_rate": 1.7673177947416258e-05, "loss": 1.5713, "step": 88 }, { "epoch": 0.24585635359116023, "grad_norm": 0.4856030344963074, "learning_rate": 1.7615640156335713e-05, "loss": 1.5448, "step": 89 }, { "epoch": 0.24861878453038674, "grad_norm": 0.4999338388442993, "learning_rate": 1.7557495743542586e-05, "loss": 1.5286, "step": 90 }, { "epoch": 0.2513812154696133, "grad_norm": 0.6124939322471619, "learning_rate": 1.749874934051401e-05, "loss": 1.5815, "step": 91 }, { "epoch": 0.2513812154696133, "eval_loss": 1.121329426765442, "eval_runtime": 153.7165, "eval_samples_per_second": 9.257, "eval_steps_per_second": 0.293, "step": 91 }, { "epoch": 0.2541436464088398, "grad_norm": 0.5473253726959229, "learning_rate": 1.7439405626678496e-05, "loss": 1.5358, "step": 92 }, { "epoch": 0.2569060773480663, "grad_norm": 0.515661895275116, "learning_rate": 1.7379469329043166e-05, "loss": 1.5705, "step": 93 }, { "epoch": 0.2596685082872928, "grad_norm": 0.5037718415260315, "learning_rate": 1.7318945221817255e-05, "loss": 1.5362, "step": 94 }, { "epoch": 0.26243093922651933, "grad_norm": 0.5043213367462158, "learning_rate": 1.7257838126031797e-05, "loss": 1.5082, "step": 95 }, { "epoch": 0.26519337016574585, "grad_norm": 0.5211421847343445, "learning_rate": 1.719615290915563e-05, "loss": 1.5486, "step": 96 }, { "epoch": 0.26795580110497236, "grad_norm": 0.5876896381378174, "learning_rate": 1.7133894484707657e-05, "loss": 1.4926, "step": 97 }, { "epoch": 0.27071823204419887, "grad_norm": 0.5891074538230896, "learning_rate": 1.7071067811865477e-05, "loss": 1.5895, "step": 98 }, { "epoch": 0.27348066298342544, "grad_norm": 0.539527952671051, "learning_rate": 1.7007677895070358e-05, "loss": 1.4588, "step": 99 }, { "epoch": 0.27624309392265195, "grad_norm": 0.4934506416320801, "learning_rate": 1.694372978362861e-05, "loss": 1.5116, "step": 100 }, { "epoch": 0.27900552486187846, "grad_norm": 0.5579091906547546, "learning_rate": 1.6879228571309377e-05, "loss": 1.4786, "step": 101 }, { "epoch": 0.281767955801105, "grad_norm": 0.5706738233566284, "learning_rate": 1.6814179395938915e-05, "loss": 1.549, "step": 102 }, { "epoch": 0.2845303867403315, "grad_norm": 0.5942355394363403, "learning_rate": 1.6748587438991303e-05, "loss": 1.4979, "step": 103 }, { "epoch": 0.287292817679558, "grad_norm": 0.5835041403770447, "learning_rate": 1.6682457925175762e-05, "loss": 1.4915, "step": 104 }, { "epoch": 0.2900552486187845, "grad_norm": 0.6035829782485962, "learning_rate": 1.6615796122020443e-05, "loss": 1.5754, "step": 105 }, { "epoch": 0.292817679558011, "grad_norm": 0.5598711967468262, "learning_rate": 1.6548607339452853e-05, "loss": 1.5017, "step": 106 }, { "epoch": 0.2955801104972376, "grad_norm": 0.5402417778968811, "learning_rate": 1.6480896929376905e-05, "loss": 1.5558, "step": 107 }, { "epoch": 0.2983425414364641, "grad_norm": 0.6131693720817566, "learning_rate": 1.641267028524661e-05, "loss": 1.532, "step": 108 }, { "epoch": 0.3011049723756906, "grad_norm": 0.5868387222290039, "learning_rate": 1.6343932841636455e-05, "loss": 1.4656, "step": 109 }, { "epoch": 0.30386740331491713, "grad_norm": 0.5688586831092834, "learning_rate": 1.627469007380852e-05, "loss": 1.5049, "step": 110 }, { "epoch": 0.30662983425414364, "grad_norm": 0.5797792673110962, "learning_rate": 1.6204947497276346e-05, "loss": 1.5484, "step": 111 }, { "epoch": 0.30939226519337015, "grad_norm": 0.5769445300102234, "learning_rate": 1.6134710667365598e-05, "loss": 1.4402, "step": 112 }, { "epoch": 0.31215469613259667, "grad_norm": 0.6333233118057251, "learning_rate": 1.6063985178771555e-05, "loss": 1.4678, "step": 113 }, { "epoch": 0.3149171270718232, "grad_norm": 0.558728814125061, "learning_rate": 1.599277666511347e-05, "loss": 1.4787, "step": 114 }, { "epoch": 0.31767955801104975, "grad_norm": 0.5599526166915894, "learning_rate": 1.592109079848583e-05, "loss": 1.5417, "step": 115 }, { "epoch": 0.32044198895027626, "grad_norm": 0.5926061272621155, "learning_rate": 1.584893328900653e-05, "loss": 1.3919, "step": 116 }, { "epoch": 0.32320441988950277, "grad_norm": 0.589717447757721, "learning_rate": 1.577630988436206e-05, "loss": 1.5362, "step": 117 }, { "epoch": 0.3259668508287293, "grad_norm": 0.6871376037597656, "learning_rate": 1.5703226369349642e-05, "loss": 1.4358, "step": 118 }, { "epoch": 0.3287292817679558, "grad_norm": 0.607738733291626, "learning_rate": 1.562968856541648e-05, "loss": 1.5095, "step": 119 }, { "epoch": 0.3314917127071823, "grad_norm": 0.635498583316803, "learning_rate": 1.5555702330196024e-05, "loss": 1.4874, "step": 120 }, { "epoch": 0.3342541436464088, "grad_norm": 0.6137527227401733, "learning_rate": 1.5481273557041402e-05, "loss": 1.4166, "step": 121 }, { "epoch": 0.3370165745856354, "grad_norm": 0.6075506210327148, "learning_rate": 1.5406408174555978e-05, "loss": 1.5638, "step": 122 }, { "epoch": 0.3397790055248619, "grad_norm": 0.6399998068809509, "learning_rate": 1.5331112146121104e-05, "loss": 1.503, "step": 123 }, { "epoch": 0.3425414364640884, "grad_norm": 0.5782871246337891, "learning_rate": 1.525539146942113e-05, "loss": 1.481, "step": 124 }, { "epoch": 0.3453038674033149, "grad_norm": 0.6390048265457153, "learning_rate": 1.5179252175965632e-05, "loss": 1.4298, "step": 125 }, { "epoch": 0.34806629834254144, "grad_norm": 0.5457322001457214, "learning_rate": 1.5102700330609e-05, "loss": 1.4868, "step": 126 }, { "epoch": 0.35082872928176795, "grad_norm": 0.5977615118026733, "learning_rate": 1.5025742031067316e-05, "loss": 1.4753, "step": 127 }, { "epoch": 0.35359116022099446, "grad_norm": 0.6722098588943481, "learning_rate": 1.4948383407432678e-05, "loss": 1.5022, "step": 128 }, { "epoch": 0.356353591160221, "grad_norm": 0.6676556468009949, "learning_rate": 1.4870630621684873e-05, "loss": 1.4862, "step": 129 }, { "epoch": 0.35911602209944754, "grad_norm": 0.6139523386955261, "learning_rate": 1.479248986720057e-05, "loss": 1.4543, "step": 130 }, { "epoch": 0.36187845303867405, "grad_norm": 0.621616542339325, "learning_rate": 1.4713967368259981e-05, "loss": 1.4795, "step": 131 }, { "epoch": 0.36464088397790057, "grad_norm": 0.6133718490600586, "learning_rate": 1.4635069379551054e-05, "loss": 1.4821, "step": 132 }, { "epoch": 0.3674033149171271, "grad_norm": 0.7033741474151611, "learning_rate": 1.4555802185671297e-05, "loss": 1.5079, "step": 133 }, { "epoch": 0.3701657458563536, "grad_norm": 0.6731663942337036, "learning_rate": 1.4476172100627127e-05, "loss": 1.4438, "step": 134 }, { "epoch": 0.3729281767955801, "grad_norm": 0.6156182885169983, "learning_rate": 1.4396185467330974e-05, "loss": 1.5067, "step": 135 }, { "epoch": 0.3756906077348066, "grad_norm": 0.6311376690864563, "learning_rate": 1.4315848657096006e-05, "loss": 1.4958, "step": 136 }, { "epoch": 0.3784530386740331, "grad_norm": 0.6299065947532654, "learning_rate": 1.4235168069128657e-05, "loss": 1.4514, "step": 137 }, { "epoch": 0.3812154696132597, "grad_norm": 0.7021288275718689, "learning_rate": 1.4154150130018867e-05, "loss": 1.4633, "step": 138 }, { "epoch": 0.3839779005524862, "grad_norm": 0.6808854937553406, "learning_rate": 1.407280129322819e-05, "loss": 1.5116, "step": 139 }, { "epoch": 0.3867403314917127, "grad_norm": 0.6397327780723572, "learning_rate": 1.3991128038575741e-05, "loss": 1.4773, "step": 140 }, { "epoch": 0.38950276243093923, "grad_norm": 0.6835840344429016, "learning_rate": 1.3909136871722066e-05, "loss": 1.4515, "step": 141 }, { "epoch": 0.39226519337016574, "grad_norm": 0.6662083268165588, "learning_rate": 1.3826834323650899e-05, "loss": 1.4796, "step": 142 }, { "epoch": 0.39502762430939226, "grad_norm": 0.6650438904762268, "learning_rate": 1.374422695014897e-05, "loss": 1.5343, "step": 143 }, { "epoch": 0.39779005524861877, "grad_norm": 0.6556645035743713, "learning_rate": 1.3661321331283796e-05, "loss": 1.5149, "step": 144 }, { "epoch": 0.4005524861878453, "grad_norm": 0.7009831666946411, "learning_rate": 1.3578124070879534e-05, "loss": 1.3801, "step": 145 }, { "epoch": 0.40331491712707185, "grad_norm": 0.6953743696212769, "learning_rate": 1.3494641795990986e-05, "loss": 1.3673, "step": 146 }, { "epoch": 0.40607734806629836, "grad_norm": 0.7101163268089294, "learning_rate": 1.3410881156375684e-05, "loss": 1.4491, "step": 147 }, { "epoch": 0.4088397790055249, "grad_norm": 0.6939963102340698, "learning_rate": 1.3326848823964243e-05, "loss": 1.4347, "step": 148 }, { "epoch": 0.4116022099447514, "grad_norm": 0.7195811867713928, "learning_rate": 1.3242551492328875e-05, "loss": 1.4454, "step": 149 }, { "epoch": 0.4143646408839779, "grad_norm": 0.7362983226776123, "learning_rate": 1.3157995876150252e-05, "loss": 1.4402, "step": 150 }, { "epoch": 0.4171270718232044, "grad_norm": 0.706754744052887, "learning_rate": 1.3073188710682612e-05, "loss": 1.2887, "step": 151 }, { "epoch": 0.4198895027624309, "grad_norm": 0.7431464195251465, "learning_rate": 1.2988136751217292e-05, "loss": 1.4228, "step": 152 }, { "epoch": 0.42265193370165743, "grad_norm": 0.7429276704788208, "learning_rate": 1.2902846772544625e-05, "loss": 1.5767, "step": 153 }, { "epoch": 0.425414364640884, "grad_norm": 0.672516405582428, "learning_rate": 1.2817325568414299e-05, "loss": 1.4013, "step": 154 }, { "epoch": 0.4281767955801105, "grad_norm": 0.7116546630859375, "learning_rate": 1.27315799509942e-05, "loss": 1.4784, "step": 155 }, { "epoch": 0.430939226519337, "grad_norm": 0.7850514054298401, "learning_rate": 1.2645616750327792e-05, "loss": 1.4335, "step": 156 }, { "epoch": 0.43370165745856354, "grad_norm": 0.7218865752220154, "learning_rate": 1.2559442813790077e-05, "loss": 1.4628, "step": 157 }, { "epoch": 0.43646408839779005, "grad_norm": 0.746462881565094, "learning_rate": 1.2473065005542155e-05, "loss": 1.4531, "step": 158 }, { "epoch": 0.43922651933701656, "grad_norm": 0.7100480794906616, "learning_rate": 1.2386490205984488e-05, "loss": 1.4729, "step": 159 }, { "epoch": 0.4419889502762431, "grad_norm": 0.7361482977867126, "learning_rate": 1.2299725311208807e-05, "loss": 1.5175, "step": 160 }, { "epoch": 0.4447513812154696, "grad_norm": 0.7085789442062378, "learning_rate": 1.2212777232448837e-05, "loss": 1.4351, "step": 161 }, { "epoch": 0.44751381215469616, "grad_norm": 0.6986429691314697, "learning_rate": 1.2125652895529766e-05, "loss": 1.484, "step": 162 }, { "epoch": 0.45027624309392267, "grad_norm": 0.656014084815979, "learning_rate": 1.2038359240316589e-05, "loss": 1.4858, "step": 163 }, { "epoch": 0.4530386740331492, "grad_norm": 0.7168847322463989, "learning_rate": 1.1950903220161286e-05, "loss": 1.4222, "step": 164 }, { "epoch": 0.4558011049723757, "grad_norm": 0.6796424388885498, "learning_rate": 1.186329180134898e-05, "loss": 1.5088, "step": 165 }, { "epoch": 0.4585635359116022, "grad_norm": 0.6602550148963928, "learning_rate": 1.1775531962543036e-05, "loss": 1.2236, "step": 166 }, { "epoch": 0.4613259668508287, "grad_norm": 0.7656331062316895, "learning_rate": 1.1687630694229159e-05, "loss": 1.4906, "step": 167 }, { "epoch": 0.46408839779005523, "grad_norm": 0.7794011235237122, "learning_rate": 1.1599594998158602e-05, "loss": 1.4335, "step": 168 }, { "epoch": 0.46685082872928174, "grad_norm": 0.7844555377960205, "learning_rate": 1.1511431886790407e-05, "loss": 1.3969, "step": 169 }, { "epoch": 0.4696132596685083, "grad_norm": 0.6949150562286377, "learning_rate": 1.1423148382732854e-05, "loss": 1.4488, "step": 170 }, { "epoch": 0.4723756906077348, "grad_norm": 0.866104781627655, "learning_rate": 1.1334751518184062e-05, "loss": 1.3399, "step": 171 }, { "epoch": 0.47513812154696133, "grad_norm": 0.763599157333374, "learning_rate": 1.124624833437186e-05, "loss": 1.5064, "step": 172 }, { "epoch": 0.47790055248618785, "grad_norm": 0.7990726232528687, "learning_rate": 1.1157645880992901e-05, "loss": 1.4328, "step": 173 }, { "epoch": 0.48066298342541436, "grad_norm": 0.782218873500824, "learning_rate": 1.1068951215651132e-05, "loss": 1.3896, "step": 174 }, { "epoch": 0.48342541436464087, "grad_norm": 0.741022527217865, "learning_rate": 1.098017140329561e-05, "loss": 1.3803, "step": 175 }, { "epoch": 0.4861878453038674, "grad_norm": 0.8042486310005188, "learning_rate": 1.089131351565776e-05, "loss": 1.4333, "step": 176 }, { "epoch": 0.4889502762430939, "grad_norm": 0.7474654316902161, "learning_rate": 1.080238463068808e-05, "loss": 1.4138, "step": 177 }, { "epoch": 0.49171270718232046, "grad_norm": 0.7193496227264404, "learning_rate": 1.0713391831992324e-05, "loss": 1.4721, "step": 178 }, { "epoch": 0.494475138121547, "grad_norm": 0.7830453515052795, "learning_rate": 1.0624342208267293e-05, "loss": 1.475, "step": 179 }, { "epoch": 0.4972375690607735, "grad_norm": 0.6899390816688538, "learning_rate": 1.0535242852736152e-05, "loss": 1.3806, "step": 180 }, { "epoch": 0.5, "grad_norm": 0.779139518737793, "learning_rate": 1.0446100862583459e-05, "loss": 1.4362, "step": 181 }, { "epoch": 0.5027624309392266, "grad_norm": 0.7901574969291687, "learning_rate": 1.0356923338389807e-05, "loss": 1.5008, "step": 182 }, { "epoch": 0.5027624309392266, "eval_loss": 1.09184730052948, "eval_runtime": 154.1688, "eval_samples_per_second": 9.23, "eval_steps_per_second": 0.292, "step": 182 }, { "epoch": 0.505524861878453, "grad_norm": 0.7908266186714172, "learning_rate": 1.0267717383566247e-05, "loss": 1.4892, "step": 183 }, { "epoch": 0.5082872928176796, "grad_norm": 0.7942709922790527, "learning_rate": 1.0178490103788462e-05, "loss": 1.391, "step": 184 }, { "epoch": 0.511049723756906, "grad_norm": 0.76580411195755, "learning_rate": 1.0089248606430775e-05, "loss": 1.382, "step": 185 }, { "epoch": 0.5138121546961326, "grad_norm": 0.7731238603591919, "learning_rate": 1e-05, "loss": 1.4122, "step": 186 }, { "epoch": 0.5165745856353591, "grad_norm": 0.8293752074241638, "learning_rate": 9.910751393569228e-06, "loss": 1.5193, "step": 187 }, { "epoch": 0.5193370165745856, "grad_norm": 0.9073200225830078, "learning_rate": 9.82150989621154e-06, "loss": 1.3044, "step": 188 }, { "epoch": 0.5220994475138122, "grad_norm": 0.7942474484443665, "learning_rate": 9.732282616433756e-06, "loss": 1.4997, "step": 189 }, { "epoch": 0.5248618784530387, "grad_norm": 0.7780297994613647, "learning_rate": 9.643076661610197e-06, "loss": 1.4907, "step": 190 }, { "epoch": 0.5276243093922652, "grad_norm": 0.8634337782859802, "learning_rate": 9.553899137416546e-06, "loss": 1.3418, "step": 191 }, { "epoch": 0.5303867403314917, "grad_norm": 0.7954802513122559, "learning_rate": 9.464757147263849e-06, "loss": 1.4508, "step": 192 }, { "epoch": 0.5331491712707183, "grad_norm": 0.8032029271125793, "learning_rate": 9.37565779173271e-06, "loss": 1.4302, "step": 193 }, { "epoch": 0.5359116022099447, "grad_norm": 0.7397053837776184, "learning_rate": 9.286608168007678e-06, "loss": 1.5004, "step": 194 }, { "epoch": 0.5386740331491713, "grad_norm": 0.9212160706520081, "learning_rate": 9.197615369311926e-06, "loss": 1.4778, "step": 195 }, { "epoch": 0.5414364640883977, "grad_norm": 0.779822587966919, "learning_rate": 9.108686484342241e-06, "loss": 1.4482, "step": 196 }, { "epoch": 0.5441988950276243, "grad_norm": 0.7739251852035522, "learning_rate": 9.019828596704394e-06, "loss": 1.4599, "step": 197 }, { "epoch": 0.5469613259668509, "grad_norm": 0.7287247776985168, "learning_rate": 8.931048784348875e-06, "loss": 1.4667, "step": 198 }, { "epoch": 0.5497237569060773, "grad_norm": 0.758631706237793, "learning_rate": 8.8423541190071e-06, "loss": 1.3432, "step": 199 }, { "epoch": 0.5524861878453039, "grad_norm": 0.7681453824043274, "learning_rate": 8.753751665628141e-06, "loss": 1.5143, "step": 200 }, { "epoch": 0.5552486187845304, "grad_norm": 0.8416883945465088, "learning_rate": 8.665248481815941e-06, "loss": 1.362, "step": 201 }, { "epoch": 0.5580110497237569, "grad_norm": 0.7365747094154358, "learning_rate": 8.576851617267151e-06, "loss": 1.4192, "step": 202 }, { "epoch": 0.5607734806629834, "grad_norm": 0.8545944690704346, "learning_rate": 8.488568113209593e-06, "loss": 1.4204, "step": 203 }, { "epoch": 0.56353591160221, "grad_norm": 0.8342128396034241, "learning_rate": 8.4004050018414e-06, "loss": 1.3981, "step": 204 }, { "epoch": 0.5662983425414365, "grad_norm": 0.7602656483650208, "learning_rate": 8.312369305770843e-06, "loss": 1.437, "step": 205 }, { "epoch": 0.569060773480663, "grad_norm": 0.8728726506233215, "learning_rate": 8.224468037456969e-06, "loss": 1.332, "step": 206 }, { "epoch": 0.5718232044198895, "grad_norm": 0.845259428024292, "learning_rate": 8.136708198651022e-06, "loss": 1.4522, "step": 207 }, { "epoch": 0.574585635359116, "grad_norm": 0.8054398894309998, "learning_rate": 8.04909677983872e-06, "loss": 1.3441, "step": 208 }, { "epoch": 0.5773480662983426, "grad_norm": 0.7871323227882385, "learning_rate": 7.961640759683416e-06, "loss": 1.4925, "step": 209 }, { "epoch": 0.580110497237569, "grad_norm": 0.8431714177131653, "learning_rate": 7.874347104470234e-06, "loss": 1.3429, "step": 210 }, { "epoch": 0.5828729281767956, "grad_norm": 0.8485934138298035, "learning_rate": 7.787222767551164e-06, "loss": 1.4277, "step": 211 }, { "epoch": 0.585635359116022, "grad_norm": 0.8744496703147888, "learning_rate": 7.700274688791196e-06, "loss": 1.436, "step": 212 }, { "epoch": 0.5883977900552486, "grad_norm": 0.8178616762161255, "learning_rate": 7.613509794015517e-06, "loss": 1.4106, "step": 213 }, { "epoch": 0.5911602209944752, "grad_norm": 0.8785072565078735, "learning_rate": 7.5269349944578454e-06, "loss": 1.461, "step": 214 }, { "epoch": 0.5939226519337016, "grad_norm": 0.7647153735160828, "learning_rate": 7.440557186209927e-06, "loss": 1.3674, "step": 215 }, { "epoch": 0.5966850828729282, "grad_norm": 0.7870045900344849, "learning_rate": 7.354383249672212e-06, "loss": 1.4552, "step": 216 }, { "epoch": 0.5994475138121547, "grad_norm": 0.791970431804657, "learning_rate": 7.268420049005806e-06, "loss": 1.4316, "step": 217 }, { "epoch": 0.6022099447513812, "grad_norm": 0.8452421426773071, "learning_rate": 7.182674431585703e-06, "loss": 1.4539, "step": 218 }, { "epoch": 0.6049723756906077, "grad_norm": 0.9066639542579651, "learning_rate": 7.097153227455379e-06, "loss": 1.4379, "step": 219 }, { "epoch": 0.6077348066298343, "grad_norm": 0.8649947643280029, "learning_rate": 7.011863248782711e-06, "loss": 1.3483, "step": 220 }, { "epoch": 0.6104972375690608, "grad_norm": 0.8475552201271057, "learning_rate": 6.92681128931739e-06, "loss": 1.4798, "step": 221 }, { "epoch": 0.6132596685082873, "grad_norm": 0.8348631262779236, "learning_rate": 6.8420041238497525e-06, "loss": 1.4084, "step": 222 }, { "epoch": 0.6160220994475138, "grad_norm": 0.9010429382324219, "learning_rate": 6.7574485076711285e-06, "loss": 1.4011, "step": 223 }, { "epoch": 0.6187845303867403, "grad_norm": 0.8535116910934448, "learning_rate": 6.673151176035762e-06, "loss": 1.3661, "step": 224 }, { "epoch": 0.6215469613259669, "grad_norm": 0.8269426822662354, "learning_rate": 6.589118843624316e-06, "loss": 1.3894, "step": 225 }, { "epoch": 0.6243093922651933, "grad_norm": 0.76876300573349, "learning_rate": 6.505358204009018e-06, "loss": 1.4496, "step": 226 }, { "epoch": 0.6270718232044199, "grad_norm": 0.7975645065307617, "learning_rate": 6.421875929120469e-06, "loss": 1.3987, "step": 227 }, { "epoch": 0.6298342541436464, "grad_norm": 0.8084754347801208, "learning_rate": 6.33867866871621e-06, "loss": 1.4141, "step": 228 }, { "epoch": 0.6325966850828729, "grad_norm": 0.7989472150802612, "learning_rate": 6.25577304985103e-06, "loss": 1.2641, "step": 229 }, { "epoch": 0.6353591160220995, "grad_norm": 0.8317887783050537, "learning_rate": 6.173165676349103e-06, "loss": 1.3783, "step": 230 }, { "epoch": 0.638121546961326, "grad_norm": 0.883558452129364, "learning_rate": 6.090863128277938e-06, "loss": 1.3727, "step": 231 }, { "epoch": 0.6408839779005525, "grad_norm": 0.7857452630996704, "learning_rate": 6.008871961424259e-06, "loss": 1.4159, "step": 232 }, { "epoch": 0.643646408839779, "grad_norm": 0.7758345007896423, "learning_rate": 5.927198706771813e-06, "loss": 1.4271, "step": 233 }, { "epoch": 0.6464088397790055, "grad_norm": 0.8178966045379639, "learning_rate": 5.845849869981137e-06, "loss": 1.4526, "step": 234 }, { "epoch": 0.649171270718232, "grad_norm": 0.8668413758277893, "learning_rate": 5.764831930871346e-06, "loss": 1.4209, "step": 235 }, { "epoch": 0.6519337016574586, "grad_norm": 0.866632342338562, "learning_rate": 5.684151342903992e-06, "loss": 1.4785, "step": 236 }, { "epoch": 0.6546961325966851, "grad_norm": 0.8386754393577576, "learning_rate": 5.603814532669032e-06, "loss": 1.4133, "step": 237 }, { "epoch": 0.6574585635359116, "grad_norm": 0.8321985006332397, "learning_rate": 5.523827899372876e-06, "loss": 1.4049, "step": 238 }, { "epoch": 0.6602209944751382, "grad_norm": 0.8785558342933655, "learning_rate": 5.444197814328707e-06, "loss": 1.3697, "step": 239 }, { "epoch": 0.6629834254143646, "grad_norm": 0.7912325263023376, "learning_rate": 5.364930620448946e-06, "loss": 1.4928, "step": 240 }, { "epoch": 0.6657458563535912, "grad_norm": 0.8722138404846191, "learning_rate": 5.286032631740023e-06, "loss": 1.4547, "step": 241 }, { "epoch": 0.6685082872928176, "grad_norm": 0.8593927621841431, "learning_rate": 5.207510132799436e-06, "loss": 1.4188, "step": 242 }, { "epoch": 0.6712707182320442, "grad_norm": 0.7740910649299622, "learning_rate": 5.129369378315128e-06, "loss": 1.4642, "step": 243 }, { "epoch": 0.6740331491712708, "grad_norm": 0.8590474724769592, "learning_rate": 5.051616592567323e-06, "loss": 1.2543, "step": 244 }, { "epoch": 0.6767955801104972, "grad_norm": 0.862617015838623, "learning_rate": 4.974257968932687e-06, "loss": 1.4393, "step": 245 }, { "epoch": 0.6795580110497238, "grad_norm": 0.8476159572601318, "learning_rate": 4.897299669391006e-06, "loss": 1.4986, "step": 246 }, { "epoch": 0.6823204419889503, "grad_norm": 0.8105822205543518, "learning_rate": 4.820747824034369e-06, "loss": 1.5027, "step": 247 }, { "epoch": 0.6850828729281768, "grad_norm": 0.9459635615348816, "learning_rate": 4.744608530578872e-06, "loss": 1.3336, "step": 248 }, { "epoch": 0.6878453038674033, "grad_norm": 0.8690259456634521, "learning_rate": 4.668887853878896e-06, "loss": 1.3479, "step": 249 }, { "epoch": 0.6906077348066298, "grad_norm": 0.8513994812965393, "learning_rate": 4.593591825444028e-06, "loss": 1.5261, "step": 250 }, { "epoch": 0.6933701657458563, "grad_norm": 0.8456645011901855, "learning_rate": 4.518726442958599e-06, "loss": 1.4669, "step": 251 }, { "epoch": 0.6961325966850829, "grad_norm": 0.8776260018348694, "learning_rate": 4.444297669803981e-06, "loss": 1.4177, "step": 252 }, { "epoch": 0.6988950276243094, "grad_norm": 0.860339343547821, "learning_rate": 4.370311434583525e-06, "loss": 1.454, "step": 253 }, { "epoch": 0.7016574585635359, "grad_norm": 0.902574360370636, "learning_rate": 4.296773630650358e-06, "loss": 1.3987, "step": 254 }, { "epoch": 0.7044198895027625, "grad_norm": 0.8529084920883179, "learning_rate": 4.223690115637944e-06, "loss": 1.4428, "step": 255 }, { "epoch": 0.7071823204419889, "grad_norm": 0.8335188627243042, "learning_rate": 4.15106671099347e-06, "loss": 1.4812, "step": 256 }, { "epoch": 0.7099447513812155, "grad_norm": 0.8945775628089905, "learning_rate": 4.078909201514172e-06, "loss": 1.3559, "step": 257 }, { "epoch": 0.712707182320442, "grad_norm": 0.9545679688453674, "learning_rate": 4.007223334886531e-06, "loss": 1.3806, "step": 258 }, { "epoch": 0.7154696132596685, "grad_norm": 0.7985166311264038, "learning_rate": 3.936014821228448e-06, "loss": 1.3433, "step": 259 }, { "epoch": 0.7182320441988951, "grad_norm": 0.8410876393318176, "learning_rate": 3.865289332634407e-06, "loss": 1.4393, "step": 260 }, { "epoch": 0.7209944751381215, "grad_norm": 0.8620865941047668, "learning_rate": 3.7950525027236585e-06, "loss": 1.2242, "step": 261 }, { "epoch": 0.7237569060773481, "grad_norm": 0.8445248603820801, "learning_rate": 3.7253099261914794e-06, "loss": 1.403, "step": 262 }, { "epoch": 0.7265193370165746, "grad_norm": 0.8497399687767029, "learning_rate": 3.6560671583635467e-06, "loss": 1.4453, "step": 263 }, { "epoch": 0.7292817679558011, "grad_norm": 0.83712238073349, "learning_rate": 3.5873297147533913e-06, "loss": 1.503, "step": 264 }, { "epoch": 0.7320441988950276, "grad_norm": 0.8133417963981628, "learning_rate": 3.5191030706230967e-06, "loss": 1.3821, "step": 265 }, { "epoch": 0.7348066298342542, "grad_norm": 0.8870840668678284, "learning_rate": 3.4513926605471504e-06, "loss": 1.4533, "step": 266 }, { "epoch": 0.7375690607734806, "grad_norm": 0.9070349335670471, "learning_rate": 3.3842038779795594e-06, "loss": 1.4055, "step": 267 }, { "epoch": 0.7403314917127072, "grad_norm": 0.8012280464172363, "learning_rate": 3.3175420748242405e-06, "loss": 1.2719, "step": 268 }, { "epoch": 0.7430939226519337, "grad_norm": 0.8506048321723938, "learning_rate": 3.2514125610086957e-06, "loss": 1.3588, "step": 269 }, { "epoch": 0.7458563535911602, "grad_norm": 0.7189351916313171, "learning_rate": 3.1858206040610883e-06, "loss": 1.2039, "step": 270 }, { "epoch": 0.7486187845303868, "grad_norm": 0.8134811520576477, "learning_rate": 3.1207714286906253e-06, "loss": 1.2114, "step": 271 }, { "epoch": 0.7513812154696132, "grad_norm": 0.8526425957679749, "learning_rate": 3.0562702163713954e-06, "loss": 1.5044, "step": 272 }, { "epoch": 0.7541436464088398, "grad_norm": 0.9256671071052551, "learning_rate": 2.9923221049296448e-06, "loss": 1.3762, "step": 273 }, { "epoch": 0.7541436464088398, "eval_loss": 1.0853379964828491, "eval_runtime": 155.2763, "eval_samples_per_second": 9.164, "eval_steps_per_second": 0.29, "step": 273 }, { "epoch": 0.7569060773480663, "grad_norm": 1.0012723207473755, "learning_rate": 2.9289321881345257e-06, "loss": 1.4391, "step": 274 }, { "epoch": 0.7596685082872928, "grad_norm": 0.9301908016204834, "learning_rate": 2.8661055152923456e-06, "loss": 1.3913, "step": 275 }, { "epoch": 0.7624309392265194, "grad_norm": 0.9066746830940247, "learning_rate": 2.8038470908443717e-06, "loss": 1.3527, "step": 276 }, { "epoch": 0.7651933701657458, "grad_norm": 0.8317292332649231, "learning_rate": 2.742161873968202e-06, "loss": 1.4783, "step": 277 }, { "epoch": 0.7679558011049724, "grad_norm": 0.892299473285675, "learning_rate": 2.681054778182748e-06, "loss": 1.3643, "step": 278 }, { "epoch": 0.7707182320441989, "grad_norm": 0.8800353407859802, "learning_rate": 2.6205306709568358e-06, "loss": 1.4015, "step": 279 }, { "epoch": 0.7734806629834254, "grad_norm": 0.8604403138160706, "learning_rate": 2.5605943733215044e-06, "loss": 1.4247, "step": 280 }, { "epoch": 0.7762430939226519, "grad_norm": 0.8747833967208862, "learning_rate": 2.501250659485992e-06, "loss": 1.3748, "step": 281 }, { "epoch": 0.7790055248618785, "grad_norm": 0.8854520320892334, "learning_rate": 2.4425042564574186e-06, "loss": 1.4518, "step": 282 }, { "epoch": 0.7817679558011049, "grad_norm": 0.8976154327392578, "learning_rate": 2.38435984366429e-06, "loss": 1.3765, "step": 283 }, { "epoch": 0.7845303867403315, "grad_norm": 0.8289183378219604, "learning_rate": 2.3268220525837436e-06, "loss": 1.4559, "step": 284 }, { "epoch": 0.787292817679558, "grad_norm": 0.8268212080001831, "learning_rate": 2.26989546637263e-06, "loss": 1.4105, "step": 285 }, { "epoch": 0.7900552486187845, "grad_norm": 0.7702677249908447, "learning_rate": 2.213584619502451e-06, "loss": 1.4579, "step": 286 }, { "epoch": 0.7928176795580111, "grad_norm": 0.9030853509902954, "learning_rate": 2.157893997398146e-06, "loss": 1.3844, "step": 287 }, { "epoch": 0.7955801104972375, "grad_norm": 0.8461067080497742, "learning_rate": 2.1028280360808405e-06, "loss": 1.4283, "step": 288 }, { "epoch": 0.7983425414364641, "grad_norm": 0.860618531703949, "learning_rate": 2.0483911218144713e-06, "loss": 1.4576, "step": 289 }, { "epoch": 0.8011049723756906, "grad_norm": 0.9135028123855591, "learning_rate": 1.994587590756397e-06, "loss": 1.3549, "step": 290 }, { "epoch": 0.8038674033149171, "grad_norm": 0.936713457107544, "learning_rate": 1.941421728612023e-06, "loss": 1.4119, "step": 291 }, { "epoch": 0.8066298342541437, "grad_norm": 0.9670917987823486, "learning_rate": 1.8888977702934086e-06, "loss": 1.4523, "step": 292 }, { "epoch": 0.8093922651933702, "grad_norm": 0.927078902721405, "learning_rate": 1.8370198995819432e-06, "loss": 1.405, "step": 293 }, { "epoch": 0.8121546961325967, "grad_norm": 0.8833804130554199, "learning_rate": 1.7857922487950873e-06, "loss": 1.4527, "step": 294 }, { "epoch": 0.8149171270718232, "grad_norm": 0.821182131767273, "learning_rate": 1.7352188984572026e-06, "loss": 1.3541, "step": 295 }, { "epoch": 0.8176795580110497, "grad_norm": 0.8782601952552795, "learning_rate": 1.6853038769745466e-06, "loss": 1.45, "step": 296 }, { "epoch": 0.8204419889502762, "grad_norm": 0.8677568435668945, "learning_rate": 1.6360511603143648e-06, "loss": 1.4061, "step": 297 }, { "epoch": 0.8232044198895028, "grad_norm": 0.9125531911849976, "learning_rate": 1.587464671688187e-06, "loss": 1.4138, "step": 298 }, { "epoch": 0.8259668508287292, "grad_norm": 0.8667160272598267, "learning_rate": 1.5395482812393513e-06, "loss": 1.4394, "step": 299 }, { "epoch": 0.8287292817679558, "grad_norm": 0.9092755913734436, "learning_rate": 1.492305805734693e-06, "loss": 1.4223, "step": 300 }, { "epoch": 0.8314917127071824, "grad_norm": 0.813664436340332, "learning_rate": 1.4457410082605483e-06, "loss": 1.4421, "step": 301 }, { "epoch": 0.8342541436464088, "grad_norm": 0.9390541911125183, "learning_rate": 1.3998575979229944e-06, "loss": 1.3292, "step": 302 }, { "epoch": 0.8370165745856354, "grad_norm": 0.9930654168128967, "learning_rate": 1.3546592295524075e-06, "loss": 1.4802, "step": 303 }, { "epoch": 0.8397790055248618, "grad_norm": 0.8965404629707336, "learning_rate": 1.3101495034123313e-06, "loss": 1.4596, "step": 304 }, { "epoch": 0.8425414364640884, "grad_norm": 0.9070413112640381, "learning_rate": 1.2663319649127025e-06, "loss": 1.4076, "step": 305 }, { "epoch": 0.8453038674033149, "grad_norm": 0.7749320268630981, "learning_rate": 1.2232101043274437e-06, "loss": 1.2553, "step": 306 }, { "epoch": 0.8480662983425414, "grad_norm": 0.949621856212616, "learning_rate": 1.1807873565164507e-06, "loss": 1.3909, "step": 307 }, { "epoch": 0.850828729281768, "grad_norm": 0.8850001096725464, "learning_rate": 1.139067100651976e-06, "loss": 1.3992, "step": 308 }, { "epoch": 0.8535911602209945, "grad_norm": 0.8339431285858154, "learning_rate": 1.0980526599494733e-06, "loss": 1.4141, "step": 309 }, { "epoch": 0.856353591160221, "grad_norm": 0.8964627385139465, "learning_rate": 1.0577473014028872e-06, "loss": 1.3828, "step": 310 }, { "epoch": 0.8591160220994475, "grad_norm": 0.8783918023109436, "learning_rate": 1.0181542355244167e-06, "loss": 1.3791, "step": 311 }, { "epoch": 0.861878453038674, "grad_norm": 0.8714961409568787, "learning_rate": 9.792766160887868e-07, "loss": 1.4513, "step": 312 }, { "epoch": 0.8646408839779005, "grad_norm": 0.8861087560653687, "learning_rate": 9.411175398820271e-07, "loss": 1.3653, "step": 313 }, { "epoch": 0.8674033149171271, "grad_norm": 0.8015440702438354, "learning_rate": 9.036800464548157e-07, "loss": 1.4818, "step": 314 }, { "epoch": 0.8701657458563536, "grad_norm": 0.8085585832595825, "learning_rate": 8.669671178803485e-07, "loss": 1.4441, "step": 315 }, { "epoch": 0.8729281767955801, "grad_norm": 0.8612155318260193, "learning_rate": 8.309816785168035e-07, "loss": 1.3938, "step": 316 }, { "epoch": 0.8756906077348067, "grad_norm": 0.866340160369873, "learning_rate": 7.957265947744131e-07, "loss": 1.3557, "step": 317 }, { "epoch": 0.8784530386740331, "grad_norm": 0.8810187578201294, "learning_rate": 7.612046748871327e-07, "loss": 1.3672, "step": 318 }, { "epoch": 0.8812154696132597, "grad_norm": 0.9028764367103577, "learning_rate": 7.274186686889539e-07, "loss": 1.411, "step": 319 }, { "epoch": 0.8839779005524862, "grad_norm": 0.9450622797012329, "learning_rate": 6.943712673948643e-07, "loss": 1.3494, "step": 320 }, { "epoch": 0.8867403314917127, "grad_norm": 0.7640666365623474, "learning_rate": 6.620651033864844e-07, "loss": 1.4108, "step": 321 }, { "epoch": 0.8895027624309392, "grad_norm": 0.8005813360214233, "learning_rate": 6.305027500023841e-07, "loss": 1.4023, "step": 322 }, { "epoch": 0.8922651933701657, "grad_norm": 0.8076258301734924, "learning_rate": 5.996867213330993e-07, "loss": 1.4143, "step": 323 }, { "epoch": 0.8950276243093923, "grad_norm": 0.9263111352920532, "learning_rate": 5.696194720208792e-07, "loss": 1.4247, "step": 324 }, { "epoch": 0.8977900552486188, "grad_norm": 0.7982999682426453, "learning_rate": 5.403033970641647e-07, "loss": 1.4022, "step": 325 }, { "epoch": 0.9005524861878453, "grad_norm": 0.8366706967353821, "learning_rate": 5.117408316268047e-07, "loss": 1.5076, "step": 326 }, { "epoch": 0.9033149171270718, "grad_norm": 0.8495275974273682, "learning_rate": 4.839340508520563e-07, "loss": 1.4251, "step": 327 }, { "epoch": 0.9060773480662984, "grad_norm": 0.8747670650482178, "learning_rate": 4.5688526968136193e-07, "loss": 1.3898, "step": 328 }, { "epoch": 0.9088397790055248, "grad_norm": 0.8361295461654663, "learning_rate": 4.305966426779118e-07, "loss": 1.4397, "step": 329 }, { "epoch": 0.9116022099447514, "grad_norm": 0.8162137866020203, "learning_rate": 4.0507026385502747e-07, "loss": 1.4023, "step": 330 }, { "epoch": 0.914364640883978, "grad_norm": 0.8866202235221863, "learning_rate": 3.8030816650935777e-07, "loss": 1.3697, "step": 331 }, { "epoch": 0.9171270718232044, "grad_norm": 0.865460216999054, "learning_rate": 3.5631232305893047e-07, "loss": 1.4998, "step": 332 }, { "epoch": 0.919889502762431, "grad_norm": 0.9784380197525024, "learning_rate": 3.3308464488602587e-07, "loss": 1.4262, "step": 333 }, { "epoch": 0.9226519337016574, "grad_norm": 0.8989951610565186, "learning_rate": 3.106269821849273e-07, "loss": 1.3742, "step": 334 }, { "epoch": 0.925414364640884, "grad_norm": 0.8595617413520813, "learning_rate": 2.889411238145545e-07, "loss": 1.4038, "step": 335 }, { "epoch": 0.9281767955801105, "grad_norm": 0.8496826887130737, "learning_rate": 2.6802879715596585e-07, "loss": 1.3938, "step": 336 }, { "epoch": 0.930939226519337, "grad_norm": 0.8510773777961731, "learning_rate": 2.478916679747623e-07, "loss": 1.4617, "step": 337 }, { "epoch": 0.9337016574585635, "grad_norm": 0.8786448240280151, "learning_rate": 2.2853134028840594e-07, "loss": 1.3615, "step": 338 }, { "epoch": 0.93646408839779, "grad_norm": 0.9002432823181152, "learning_rate": 2.099493562384469e-07, "loss": 1.4047, "step": 339 }, { "epoch": 0.9392265193370166, "grad_norm": 0.8121012449264526, "learning_rate": 1.921471959676957e-07, "loss": 1.4307, "step": 340 }, { "epoch": 0.9419889502762431, "grad_norm": 0.8053271770477295, "learning_rate": 1.7512627750230772e-07, "loss": 1.4443, "step": 341 }, { "epoch": 0.9447513812154696, "grad_norm": 0.9050955176353455, "learning_rate": 1.5888795663883904e-07, "loss": 1.4102, "step": 342 }, { "epoch": 0.9475138121546961, "grad_norm": 0.9110968112945557, "learning_rate": 1.4343352683625412e-07, "loss": 1.4198, "step": 343 }, { "epoch": 0.9502762430939227, "grad_norm": 0.929118812084198, "learning_rate": 1.2876421911288906e-07, "loss": 1.4203, "step": 344 }, { "epoch": 0.9530386740331491, "grad_norm": 0.9060593843460083, "learning_rate": 1.148812019483958e-07, "loss": 1.3268, "step": 345 }, { "epoch": 0.9558011049723757, "grad_norm": 0.8292207717895508, "learning_rate": 1.0178558119067316e-07, "loss": 1.3877, "step": 346 }, { "epoch": 0.9585635359116023, "grad_norm": 0.8450669646263123, "learning_rate": 8.947839996777286e-08, "loss": 1.3981, "step": 347 }, { "epoch": 0.9613259668508287, "grad_norm": 0.9085505604743958, "learning_rate": 7.796063860481595e-08, "loss": 1.353, "step": 348 }, { "epoch": 0.9640883977900553, "grad_norm": 0.8617837429046631, "learning_rate": 6.723321454590093e-08, "loss": 1.4215, "step": 349 }, { "epoch": 0.9668508287292817, "grad_norm": 0.8624401092529297, "learning_rate": 5.7296982281026534e-08, "loss": 1.3668, "step": 350 }, { "epoch": 0.9696132596685083, "grad_norm": 0.8622123003005981, "learning_rate": 4.815273327803183e-08, "loss": 1.3723, "step": 351 }, { "epoch": 0.9723756906077348, "grad_norm": 0.7842567563056946, "learning_rate": 3.980119591954101e-08, "loss": 1.3255, "step": 352 }, { "epoch": 0.9751381215469613, "grad_norm": 0.8146648406982422, "learning_rate": 3.224303544495766e-08, "loss": 1.3913, "step": 353 }, { "epoch": 0.9779005524861878, "grad_norm": 0.8488260507583618, "learning_rate": 2.547885389746485e-08, "loss": 1.385, "step": 354 }, { "epoch": 0.9806629834254144, "grad_norm": 0.8932926058769226, "learning_rate": 1.9509190076074657e-08, "loss": 1.3992, "step": 355 }, { "epoch": 0.9834254143646409, "grad_norm": 0.9198254346847534, "learning_rate": 1.4334519492711362e-08, "loss": 1.3375, "step": 356 }, { "epoch": 0.9861878453038674, "grad_norm": 0.9434136748313904, "learning_rate": 9.955254334328424e-09, "loss": 1.3329, "step": 357 }, { "epoch": 0.988950276243094, "grad_norm": 0.8083603978157043, "learning_rate": 6.371743430082511e-09, "loss": 1.4152, "step": 358 }, { "epoch": 0.9917127071823204, "grad_norm": 0.8143340349197388, "learning_rate": 3.5842722235468475e-09, "loss": 1.3503, "step": 359 }, { "epoch": 0.994475138121547, "grad_norm": 0.8487191200256348, "learning_rate": 1.593062749967178e-09, "loss": 1.4374, "step": 360 }, { "epoch": 0.9972375690607734, "grad_norm": 0.8385122418403625, "learning_rate": 3.982736185859093e-10, "loss": 1.384, "step": 361 }, { "epoch": 1.0, "grad_norm": 0.8801374435424805, "learning_rate": 0.0, "loss": 1.368, "step": 362 } ], "logging_steps": 1, "max_steps": 362, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6738557053055795e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }