{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 91,
  "global_step": 362,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0027624309392265192,
      "grad_norm": 0.42503952980041504,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.7314,
      "step": 1
    },
    {
      "epoch": 0.0027624309392265192,
      "eval_loss": 1.3109512329101562,
      "eval_runtime": 138.9562,
      "eval_samples_per_second": 10.241,
      "eval_steps_per_second": 0.324,
      "step": 1
    },
    {
      "epoch": 0.0055248618784530384,
      "grad_norm": 0.4957458972930908,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.8536,
      "step": 2
    },
    {
      "epoch": 0.008287292817679558,
      "grad_norm": 0.4254479706287384,
      "learning_rate": 6e-06,
      "loss": 1.7545,
      "step": 3
    },
    {
      "epoch": 0.011049723756906077,
      "grad_norm": 0.4761411249637604,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.7814,
      "step": 4
    },
    {
      "epoch": 0.013812154696132596,
      "grad_norm": 0.47434934973716736,
      "learning_rate": 1e-05,
      "loss": 1.7703,
      "step": 5
    },
    {
      "epoch": 0.016574585635359115,
      "grad_norm": 0.4699034094810486,
      "learning_rate": 1.2e-05,
      "loss": 1.83,
      "step": 6
    },
    {
      "epoch": 0.019337016574585635,
      "grad_norm": 0.4335905909538269,
      "learning_rate": 1.4e-05,
      "loss": 1.6648,
      "step": 7
    },
    {
      "epoch": 0.022099447513812154,
      "grad_norm": 0.4264945685863495,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.7164,
      "step": 8
    },
    {
      "epoch": 0.024861878453038673,
      "grad_norm": 0.417251318693161,
      "learning_rate": 1.8e-05,
      "loss": 1.7058,
      "step": 9
    },
    {
      "epoch": 0.027624309392265192,
      "grad_norm": 0.42205414175987244,
      "learning_rate": 2e-05,
      "loss": 1.71,
      "step": 10
    },
    {
      "epoch": 0.03038674033149171,
      "grad_norm": 0.414096862077713,
      "learning_rate": 1.9999601726381415e-05,
      "loss": 1.6954,
      "step": 11
    },
    {
      "epoch": 0.03314917127071823,
      "grad_norm": 0.412890762090683,
      "learning_rate": 1.9998406937250035e-05,
      "loss": 1.6941,
      "step": 12
    },
    {
      "epoch": 0.03591160220994475,
      "grad_norm": 0.4428125321865082,
      "learning_rate": 1.9996415727776456e-05,
      "loss": 1.6439,
      "step": 13
    },
    {
      "epoch": 0.03867403314917127,
      "grad_norm": 0.4378657341003418,
      "learning_rate": 1.999362825656992e-05,
      "loss": 1.7518,
      "step": 14
    },
    {
      "epoch": 0.04143646408839779,
      "grad_norm": 0.4670448303222656,
      "learning_rate": 1.9990044745665672e-05,
      "loss": 1.6461,
      "step": 15
    },
    {
      "epoch": 0.04419889502762431,
      "grad_norm": 0.44707468152046204,
      "learning_rate": 1.998566548050729e-05,
      "loss": 1.7201,
      "step": 16
    },
    {
      "epoch": 0.04696132596685083,
      "grad_norm": 0.4317484200000763,
      "learning_rate": 1.9980490809923928e-05,
      "loss": 1.7697,
      "step": 17
    },
    {
      "epoch": 0.049723756906077346,
      "grad_norm": 0.390918105840683,
      "learning_rate": 1.9974521146102535e-05,
      "loss": 1.6336,
      "step": 18
    },
    {
      "epoch": 0.052486187845303865,
      "grad_norm": 0.44283199310302734,
      "learning_rate": 1.9967756964555044e-05,
      "loss": 1.7058,
      "step": 19
    },
    {
      "epoch": 0.055248618784530384,
      "grad_norm": 0.40721943974494934,
      "learning_rate": 1.9960198804080462e-05,
      "loss": 1.7619,
      "step": 20
    },
    {
      "epoch": 0.058011049723756904,
      "grad_norm": 0.3747328817844391,
      "learning_rate": 1.995184726672197e-05,
      "loss": 1.6998,
      "step": 21
    },
    {
      "epoch": 0.06077348066298342,
      "grad_norm": 0.3749975860118866,
      "learning_rate": 1.9942703017718977e-05,
      "loss": 1.7488,
      "step": 22
    },
    {
      "epoch": 0.06353591160220995,
      "grad_norm": 0.36129963397979736,
      "learning_rate": 1.99327667854541e-05,
      "loss": 1.6649,
      "step": 23
    },
    {
      "epoch": 0.06629834254143646,
      "grad_norm": 0.3862815797328949,
      "learning_rate": 1.9922039361395186e-05,
      "loss": 1.626,
      "step": 24
    },
    {
      "epoch": 0.06906077348066299,
      "grad_norm": 0.4051111042499542,
      "learning_rate": 1.991052160003223e-05,
      "loss": 1.6523,
      "step": 25
    },
    {
      "epoch": 0.0718232044198895,
      "grad_norm": 0.42131727933883667,
      "learning_rate": 1.989821441880933e-05,
      "loss": 1.8082,
      "step": 26
    },
    {
      "epoch": 0.07458563535911603,
      "grad_norm": 0.39152222871780396,
      "learning_rate": 1.9885118798051607e-05,
      "loss": 1.6831,
      "step": 27
    },
    {
      "epoch": 0.07734806629834254,
      "grad_norm": 0.40526384115219116,
      "learning_rate": 1.9871235780887114e-05,
      "loss": 1.6613,
      "step": 28
    },
    {
      "epoch": 0.08011049723756906,
      "grad_norm": 0.3861314654350281,
      "learning_rate": 1.9856566473163747e-05,
      "loss": 1.6981,
      "step": 29
    },
    {
      "epoch": 0.08287292817679558,
      "grad_norm": 0.38509440422058105,
      "learning_rate": 1.984111204336116e-05,
      "loss": 1.6309,
      "step": 30
    },
    {
      "epoch": 0.0856353591160221,
      "grad_norm": 0.3785051107406616,
      "learning_rate": 1.9824873722497694e-05,
      "loss": 1.675,
      "step": 31
    },
    {
      "epoch": 0.08839779005524862,
      "grad_norm": 0.4013555645942688,
      "learning_rate": 1.9807852804032306e-05,
      "loss": 1.6033,
      "step": 32
    },
    {
      "epoch": 0.09116022099447514,
      "grad_norm": 0.36987578868865967,
      "learning_rate": 1.9790050643761552e-05,
      "loss": 1.643,
      "step": 33
    },
    {
      "epoch": 0.09392265193370165,
      "grad_norm": 0.34048470854759216,
      "learning_rate": 1.9771468659711595e-05,
      "loss": 1.7037,
      "step": 34
    },
    {
      "epoch": 0.09668508287292818,
      "grad_norm": 0.370815247297287,
      "learning_rate": 1.975210833202524e-05,
      "loss": 1.7189,
      "step": 35
    },
    {
      "epoch": 0.09944751381215469,
      "grad_norm": 0.3450065553188324,
      "learning_rate": 1.9731971202844036e-05,
      "loss": 1.6371,
      "step": 36
    },
    {
      "epoch": 0.10220994475138122,
      "grad_norm": 0.3642140328884125,
      "learning_rate": 1.9711058876185446e-05,
      "loss": 1.6962,
      "step": 37
    },
    {
      "epoch": 0.10497237569060773,
      "grad_norm": 0.3298833668231964,
      "learning_rate": 1.9689373017815076e-05,
      "loss": 1.5134,
      "step": 38
    },
    {
      "epoch": 0.10773480662983426,
      "grad_norm": 0.35403338074684143,
      "learning_rate": 1.9666915355113976e-05,
      "loss": 1.7167,
      "step": 39
    },
    {
      "epoch": 0.11049723756906077,
      "grad_norm": 0.36911338567733765,
      "learning_rate": 1.964368767694107e-05,
      "loss": 1.6674,
      "step": 40
    },
    {
      "epoch": 0.1132596685082873,
      "grad_norm": 0.38089630007743835,
      "learning_rate": 1.9619691833490645e-05,
      "loss": 1.5777,
      "step": 41
    },
    {
      "epoch": 0.11602209944751381,
      "grad_norm": 0.3290576934814453,
      "learning_rate": 1.9594929736144978e-05,
      "loss": 1.6851,
      "step": 42
    },
    {
      "epoch": 0.11878453038674033,
      "grad_norm": 0.3029409348964691,
      "learning_rate": 1.956940335732209e-05,
      "loss": 1.2519,
      "step": 43
    },
    {
      "epoch": 0.12154696132596685,
      "grad_norm": 0.3706592321395874,
      "learning_rate": 1.954311473031864e-05,
      "loss": 1.6061,
      "step": 44
    },
    {
      "epoch": 0.12430939226519337,
      "grad_norm": 0.36404719948768616,
      "learning_rate": 1.9516065949147945e-05,
      "loss": 1.5515,
      "step": 45
    },
    {
      "epoch": 0.1270718232044199,
      "grad_norm": 0.33177605271339417,
      "learning_rate": 1.9488259168373198e-05,
      "loss": 1.5692,
      "step": 46
    },
    {
      "epoch": 0.1298342541436464,
      "grad_norm": 0.3582829535007477,
      "learning_rate": 1.9459696602935838e-05,
      "loss": 1.5336,
      "step": 47
    },
    {
      "epoch": 0.13259668508287292,
      "grad_norm": 0.3565369248390198,
      "learning_rate": 1.9430380527979124e-05,
      "loss": 1.6251,
      "step": 48
    },
    {
      "epoch": 0.13535911602209943,
      "grad_norm": 0.3471137285232544,
      "learning_rate": 1.94003132786669e-05,
      "loss": 1.6245,
      "step": 49
    },
    {
      "epoch": 0.13812154696132597,
      "grad_norm": 0.4011210501194,
      "learning_rate": 1.936949724999762e-05,
      "loss": 1.3687,
      "step": 50
    },
    {
      "epoch": 0.1408839779005525,
      "grad_norm": 0.36058053374290466,
      "learning_rate": 1.9337934896613516e-05,
      "loss": 1.5283,
      "step": 51
    },
    {
      "epoch": 0.143646408839779,
      "grad_norm": 0.39054054021835327,
      "learning_rate": 1.930562873260514e-05,
      "loss": 1.5784,
      "step": 52
    },
    {
      "epoch": 0.1464088397790055,
      "grad_norm": 0.39804011583328247,
      "learning_rate": 1.927258133131105e-05,
      "loss": 1.605,
      "step": 53
    },
    {
      "epoch": 0.14917127071823205,
      "grad_norm": 0.42572247982025146,
      "learning_rate": 1.9238795325112867e-05,
      "loss": 1.4554,
      "step": 54
    },
    {
      "epoch": 0.15193370165745856,
      "grad_norm": 0.4022100269794464,
      "learning_rate": 1.9204273405225588e-05,
      "loss": 1.5905,
      "step": 55
    },
    {
      "epoch": 0.15469613259668508,
      "grad_norm": 0.43215224146842957,
      "learning_rate": 1.9169018321483198e-05,
      "loss": 1.6233,
      "step": 56
    },
    {
      "epoch": 0.1574585635359116,
      "grad_norm": 0.38549479842185974,
      "learning_rate": 1.9133032882119656e-05,
      "loss": 1.5868,
      "step": 57
    },
    {
      "epoch": 0.16022099447513813,
      "grad_norm": 0.42155200242996216,
      "learning_rate": 1.9096319953545186e-05,
      "loss": 1.4519,
      "step": 58
    },
    {
      "epoch": 0.16298342541436464,
      "grad_norm": 0.4292687177658081,
      "learning_rate": 1.9058882460117972e-05,
      "loss": 1.6053,
      "step": 59
    },
    {
      "epoch": 0.16574585635359115,
      "grad_norm": 0.44173118472099304,
      "learning_rate": 1.9020723383911214e-05,
      "loss": 1.6328,
      "step": 60
    },
    {
      "epoch": 0.1685082872928177,
      "grad_norm": 0.4508700966835022,
      "learning_rate": 1.8981845764475585e-05,
      "loss": 1.6,
      "step": 61
    },
    {
      "epoch": 0.1712707182320442,
      "grad_norm": 0.42624732851982117,
      "learning_rate": 1.8942252698597113e-05,
      "loss": 1.6086,
      "step": 62
    },
    {
      "epoch": 0.17403314917127072,
      "grad_norm": 0.4029814600944519,
      "learning_rate": 1.890194734005053e-05,
      "loss": 1.5955,
      "step": 63
    },
    {
      "epoch": 0.17679558011049723,
      "grad_norm": 0.4393555223941803,
      "learning_rate": 1.8860932899348028e-05,
      "loss": 1.5735,
      "step": 64
    },
    {
      "epoch": 0.17955801104972377,
      "grad_norm": 0.40672072768211365,
      "learning_rate": 1.881921264348355e-05,
      "loss": 1.5975,
      "step": 65
    },
    {
      "epoch": 0.18232044198895028,
      "grad_norm": 0.4309908151626587,
      "learning_rate": 1.8776789895672557e-05,
      "loss": 1.5539,
      "step": 66
    },
    {
      "epoch": 0.1850828729281768,
      "grad_norm": 0.41728851199150085,
      "learning_rate": 1.8733668035087302e-05,
      "loss": 1.5678,
      "step": 67
    },
    {
      "epoch": 0.1878453038674033,
      "grad_norm": 0.46532243490219116,
      "learning_rate": 1.8689850496587674e-05,
      "loss": 1.5277,
      "step": 68
    },
    {
      "epoch": 0.19060773480662985,
      "grad_norm": 0.43453872203826904,
      "learning_rate": 1.8645340770447595e-05,
      "loss": 1.5192,
      "step": 69
    },
    {
      "epoch": 0.19337016574585636,
      "grad_norm": 0.46961575746536255,
      "learning_rate": 1.8600142402077006e-05,
      "loss": 1.4962,
      "step": 70
    },
    {
      "epoch": 0.19613259668508287,
      "grad_norm": 0.47958558797836304,
      "learning_rate": 1.8554258991739454e-05,
      "loss": 1.502,
      "step": 71
    },
    {
      "epoch": 0.19889502762430938,
      "grad_norm": 0.4123697280883789,
      "learning_rate": 1.850769419426531e-05,
      "loss": 1.4997,
      "step": 72
    },
    {
      "epoch": 0.20165745856353592,
      "grad_norm": 0.5050204396247864,
      "learning_rate": 1.8460451718760653e-05,
      "loss": 1.5283,
      "step": 73
    },
    {
      "epoch": 0.20441988950276244,
      "grad_norm": 0.44690632820129395,
      "learning_rate": 1.8412535328311813e-05,
      "loss": 1.5429,
      "step": 74
    },
    {
      "epoch": 0.20718232044198895,
      "grad_norm": 0.45188507437705994,
      "learning_rate": 1.8363948839685638e-05,
      "loss": 1.532,
      "step": 75
    },
    {
      "epoch": 0.20994475138121546,
      "grad_norm": 0.4514811038970947,
      "learning_rate": 1.8314696123025456e-05,
      "loss": 1.4147,
      "step": 76
    },
    {
      "epoch": 0.212707182320442,
      "grad_norm": 0.5114113688468933,
      "learning_rate": 1.8264781101542797e-05,
      "loss": 1.5552,
      "step": 77
    },
    {
      "epoch": 0.2154696132596685,
      "grad_norm": 0.51650071144104,
      "learning_rate": 1.8214207751204917e-05,
      "loss": 1.5554,
      "step": 78
    },
    {
      "epoch": 0.21823204419889503,
      "grad_norm": 0.48588883876800537,
      "learning_rate": 1.816298010041806e-05,
      "loss": 1.5688,
      "step": 79
    },
    {
      "epoch": 0.22099447513812154,
      "grad_norm": 0.4869782328605652,
      "learning_rate": 1.8111102229706593e-05,
      "loss": 1.6062,
      "step": 80
    },
    {
      "epoch": 0.22375690607734808,
      "grad_norm": 0.4736042320728302,
      "learning_rate": 1.805857827138798e-05,
      "loss": 1.4834,
      "step": 81
    },
    {
      "epoch": 0.2265193370165746,
      "grad_norm": 0.5085079073905945,
      "learning_rate": 1.8005412409243604e-05,
      "loss": 1.4549,
      "step": 82
    },
    {
      "epoch": 0.2292817679558011,
      "grad_norm": 0.5268322229385376,
      "learning_rate": 1.7951608878185533e-05,
      "loss": 1.4762,
      "step": 83
    },
    {
      "epoch": 0.23204419889502761,
      "grad_norm": 0.5247999429702759,
      "learning_rate": 1.789717196391916e-05,
      "loss": 1.4625,
      "step": 84
    },
    {
      "epoch": 0.23480662983425415,
      "grad_norm": 0.5090611577033997,
      "learning_rate": 1.7842106002601854e-05,
      "loss": 1.4634,
      "step": 85
    },
    {
      "epoch": 0.23756906077348067,
      "grad_norm": 0.500877857208252,
      "learning_rate": 1.778641538049755e-05,
      "loss": 1.5259,
      "step": 86
    },
    {
      "epoch": 0.24033149171270718,
      "grad_norm": 0.5528935790061951,
      "learning_rate": 1.773010453362737e-05,
      "loss": 1.4783,
      "step": 87
    },
    {
      "epoch": 0.2430939226519337,
      "grad_norm": 0.4904051125049591,
      "learning_rate": 1.7673177947416258e-05,
      "loss": 1.5713,
      "step": 88
    },
    {
      "epoch": 0.24585635359116023,
      "grad_norm": 0.4856030344963074,
      "learning_rate": 1.7615640156335713e-05,
      "loss": 1.5448,
      "step": 89
    },
    {
      "epoch": 0.24861878453038674,
      "grad_norm": 0.4999338388442993,
      "learning_rate": 1.7557495743542586e-05,
      "loss": 1.5286,
      "step": 90
    },
    {
      "epoch": 0.2513812154696133,
      "grad_norm": 0.6124939322471619,
      "learning_rate": 1.749874934051401e-05,
      "loss": 1.5815,
      "step": 91
    },
    {
      "epoch": 0.2513812154696133,
      "eval_loss": 1.121329426765442,
      "eval_runtime": 153.7165,
      "eval_samples_per_second": 9.257,
      "eval_steps_per_second": 0.293,
      "step": 91
    },
    {
      "epoch": 0.2541436464088398,
      "grad_norm": 0.5473253726959229,
      "learning_rate": 1.7439405626678496e-05,
      "loss": 1.5358,
      "step": 92
    },
    {
      "epoch": 0.2569060773480663,
      "grad_norm": 0.515661895275116,
      "learning_rate": 1.7379469329043166e-05,
      "loss": 1.5705,
      "step": 93
    },
    {
      "epoch": 0.2596685082872928,
      "grad_norm": 0.5037718415260315,
      "learning_rate": 1.7318945221817255e-05,
      "loss": 1.5362,
      "step": 94
    },
    {
      "epoch": 0.26243093922651933,
      "grad_norm": 0.5043213367462158,
      "learning_rate": 1.7257838126031797e-05,
      "loss": 1.5082,
      "step": 95
    },
    {
      "epoch": 0.26519337016574585,
      "grad_norm": 0.5211421847343445,
      "learning_rate": 1.719615290915563e-05,
      "loss": 1.5486,
      "step": 96
    },
    {
      "epoch": 0.26795580110497236,
      "grad_norm": 0.5876896381378174,
      "learning_rate": 1.7133894484707657e-05,
      "loss": 1.4926,
      "step": 97
    },
    {
      "epoch": 0.27071823204419887,
      "grad_norm": 0.5891074538230896,
      "learning_rate": 1.7071067811865477e-05,
      "loss": 1.5895,
      "step": 98
    },
    {
      "epoch": 0.27348066298342544,
      "grad_norm": 0.539527952671051,
      "learning_rate": 1.7007677895070358e-05,
      "loss": 1.4588,
      "step": 99
    },
    {
      "epoch": 0.27624309392265195,
      "grad_norm": 0.4934506416320801,
      "learning_rate": 1.694372978362861e-05,
      "loss": 1.5116,
      "step": 100
    },
    {
      "epoch": 0.27900552486187846,
      "grad_norm": 0.5579091906547546,
      "learning_rate": 1.6879228571309377e-05,
      "loss": 1.4786,
      "step": 101
    },
    {
      "epoch": 0.281767955801105,
      "grad_norm": 0.5706738233566284,
      "learning_rate": 1.6814179395938915e-05,
      "loss": 1.549,
      "step": 102
    },
    {
      "epoch": 0.2845303867403315,
      "grad_norm": 0.5942355394363403,
      "learning_rate": 1.6748587438991303e-05,
      "loss": 1.4979,
      "step": 103
    },
    {
      "epoch": 0.287292817679558,
      "grad_norm": 0.5835041403770447,
      "learning_rate": 1.6682457925175762e-05,
      "loss": 1.4915,
      "step": 104
    },
    {
      "epoch": 0.2900552486187845,
      "grad_norm": 0.6035829782485962,
      "learning_rate": 1.6615796122020443e-05,
      "loss": 1.5754,
      "step": 105
    },
    {
      "epoch": 0.292817679558011,
      "grad_norm": 0.5598711967468262,
      "learning_rate": 1.6548607339452853e-05,
      "loss": 1.5017,
      "step": 106
    },
    {
      "epoch": 0.2955801104972376,
      "grad_norm": 0.5402417778968811,
      "learning_rate": 1.6480896929376905e-05,
      "loss": 1.5558,
      "step": 107
    },
    {
      "epoch": 0.2983425414364641,
      "grad_norm": 0.6131693720817566,
      "learning_rate": 1.641267028524661e-05,
      "loss": 1.532,
      "step": 108
    },
    {
      "epoch": 0.3011049723756906,
      "grad_norm": 0.5868387222290039,
      "learning_rate": 1.6343932841636455e-05,
      "loss": 1.4656,
      "step": 109
    },
    {
      "epoch": 0.30386740331491713,
      "grad_norm": 0.5688586831092834,
      "learning_rate": 1.627469007380852e-05,
      "loss": 1.5049,
      "step": 110
    },
    {
      "epoch": 0.30662983425414364,
      "grad_norm": 0.5797792673110962,
      "learning_rate": 1.6204947497276346e-05,
      "loss": 1.5484,
      "step": 111
    },
    {
      "epoch": 0.30939226519337015,
      "grad_norm": 0.5769445300102234,
      "learning_rate": 1.6134710667365598e-05,
      "loss": 1.4402,
      "step": 112
    },
    {
      "epoch": 0.31215469613259667,
      "grad_norm": 0.6333233118057251,
      "learning_rate": 1.6063985178771555e-05,
      "loss": 1.4678,
      "step": 113
    },
    {
      "epoch": 0.3149171270718232,
      "grad_norm": 0.558728814125061,
      "learning_rate": 1.599277666511347e-05,
      "loss": 1.4787,
      "step": 114
    },
    {
      "epoch": 0.31767955801104975,
      "grad_norm": 0.5599526166915894,
      "learning_rate": 1.592109079848583e-05,
      "loss": 1.5417,
      "step": 115
    },
    {
      "epoch": 0.32044198895027626,
      "grad_norm": 0.5926061272621155,
      "learning_rate": 1.584893328900653e-05,
      "loss": 1.3919,
      "step": 116
    },
    {
      "epoch": 0.32320441988950277,
      "grad_norm": 0.589717447757721,
      "learning_rate": 1.577630988436206e-05,
      "loss": 1.5362,
      "step": 117
    },
    {
      "epoch": 0.3259668508287293,
      "grad_norm": 0.6871376037597656,
      "learning_rate": 1.5703226369349642e-05,
      "loss": 1.4358,
      "step": 118
    },
    {
      "epoch": 0.3287292817679558,
      "grad_norm": 0.607738733291626,
      "learning_rate": 1.562968856541648e-05,
      "loss": 1.5095,
      "step": 119
    },
    {
      "epoch": 0.3314917127071823,
      "grad_norm": 0.635498583316803,
      "learning_rate": 1.5555702330196024e-05,
      "loss": 1.4874,
      "step": 120
    },
    {
      "epoch": 0.3342541436464088,
      "grad_norm": 0.6137527227401733,
      "learning_rate": 1.5481273557041402e-05,
      "loss": 1.4166,
      "step": 121
    },
    {
      "epoch": 0.3370165745856354,
      "grad_norm": 0.6075506210327148,
      "learning_rate": 1.5406408174555978e-05,
      "loss": 1.5638,
      "step": 122
    },
    {
      "epoch": 0.3397790055248619,
      "grad_norm": 0.6399998068809509,
      "learning_rate": 1.5331112146121104e-05,
      "loss": 1.503,
      "step": 123
    },
    {
      "epoch": 0.3425414364640884,
      "grad_norm": 0.5782871246337891,
      "learning_rate": 1.525539146942113e-05,
      "loss": 1.481,
      "step": 124
    },
    {
      "epoch": 0.3453038674033149,
      "grad_norm": 0.6390048265457153,
      "learning_rate": 1.5179252175965632e-05,
      "loss": 1.4298,
      "step": 125
    },
    {
      "epoch": 0.34806629834254144,
      "grad_norm": 0.5457322001457214,
      "learning_rate": 1.5102700330609e-05,
      "loss": 1.4868,
      "step": 126
    },
    {
      "epoch": 0.35082872928176795,
      "grad_norm": 0.5977615118026733,
      "learning_rate": 1.5025742031067316e-05,
      "loss": 1.4753,
      "step": 127
    },
    {
      "epoch": 0.35359116022099446,
      "grad_norm": 0.6722098588943481,
      "learning_rate": 1.4948383407432678e-05,
      "loss": 1.5022,
      "step": 128
    },
    {
      "epoch": 0.356353591160221,
      "grad_norm": 0.6676556468009949,
      "learning_rate": 1.4870630621684873e-05,
      "loss": 1.4862,
      "step": 129
    },
    {
      "epoch": 0.35911602209944754,
      "grad_norm": 0.6139523386955261,
      "learning_rate": 1.479248986720057e-05,
      "loss": 1.4543,
      "step": 130
    },
    {
      "epoch": 0.36187845303867405,
      "grad_norm": 0.621616542339325,
      "learning_rate": 1.4713967368259981e-05,
      "loss": 1.4795,
      "step": 131
    },
    {
      "epoch": 0.36464088397790057,
      "grad_norm": 0.6133718490600586,
      "learning_rate": 1.4635069379551054e-05,
      "loss": 1.4821,
      "step": 132
    },
    {
      "epoch": 0.3674033149171271,
      "grad_norm": 0.7033741474151611,
      "learning_rate": 1.4555802185671297e-05,
      "loss": 1.5079,
      "step": 133
    },
    {
      "epoch": 0.3701657458563536,
      "grad_norm": 0.6731663942337036,
      "learning_rate": 1.4476172100627127e-05,
      "loss": 1.4438,
      "step": 134
    },
    {
      "epoch": 0.3729281767955801,
      "grad_norm": 0.6156182885169983,
      "learning_rate": 1.4396185467330974e-05,
      "loss": 1.5067,
      "step": 135
    },
    {
      "epoch": 0.3756906077348066,
      "grad_norm": 0.6311376690864563,
      "learning_rate": 1.4315848657096006e-05,
      "loss": 1.4958,
      "step": 136
    },
    {
      "epoch": 0.3784530386740331,
      "grad_norm": 0.6299065947532654,
      "learning_rate": 1.4235168069128657e-05,
      "loss": 1.4514,
      "step": 137
    },
    {
      "epoch": 0.3812154696132597,
      "grad_norm": 0.7021288275718689,
      "learning_rate": 1.4154150130018867e-05,
      "loss": 1.4633,
      "step": 138
    },
    {
      "epoch": 0.3839779005524862,
      "grad_norm": 0.6808854937553406,
      "learning_rate": 1.407280129322819e-05,
      "loss": 1.5116,
      "step": 139
    },
    {
      "epoch": 0.3867403314917127,
      "grad_norm": 0.6397327780723572,
      "learning_rate": 1.3991128038575741e-05,
      "loss": 1.4773,
      "step": 140
    },
    {
      "epoch": 0.38950276243093923,
      "grad_norm": 0.6835840344429016,
      "learning_rate": 1.3909136871722066e-05,
      "loss": 1.4515,
      "step": 141
    },
    {
      "epoch": 0.39226519337016574,
      "grad_norm": 0.6662083268165588,
      "learning_rate": 1.3826834323650899e-05,
      "loss": 1.4796,
      "step": 142
    },
    {
      "epoch": 0.39502762430939226,
      "grad_norm": 0.6650438904762268,
      "learning_rate": 1.374422695014897e-05,
      "loss": 1.5343,
      "step": 143
    },
    {
      "epoch": 0.39779005524861877,
      "grad_norm": 0.6556645035743713,
      "learning_rate": 1.3661321331283796e-05,
      "loss": 1.5149,
      "step": 144
    },
    {
      "epoch": 0.4005524861878453,
      "grad_norm": 0.7009831666946411,
      "learning_rate": 1.3578124070879534e-05,
      "loss": 1.3801,
      "step": 145
    },
    {
      "epoch": 0.40331491712707185,
      "grad_norm": 0.6953743696212769,
      "learning_rate": 1.3494641795990986e-05,
      "loss": 1.3673,
      "step": 146
    },
    {
      "epoch": 0.40607734806629836,
      "grad_norm": 0.7101163268089294,
      "learning_rate": 1.3410881156375684e-05,
      "loss": 1.4491,
      "step": 147
    },
    {
      "epoch": 0.4088397790055249,
      "grad_norm": 0.6939963102340698,
      "learning_rate": 1.3326848823964243e-05,
      "loss": 1.4347,
      "step": 148
    },
    {
      "epoch": 0.4116022099447514,
      "grad_norm": 0.7195811867713928,
      "learning_rate": 1.3242551492328875e-05,
      "loss": 1.4454,
      "step": 149
    },
    {
      "epoch": 0.4143646408839779,
      "grad_norm": 0.7362983226776123,
      "learning_rate": 1.3157995876150252e-05,
      "loss": 1.4402,
      "step": 150
    },
    {
      "epoch": 0.4171270718232044,
      "grad_norm": 0.706754744052887,
      "learning_rate": 1.3073188710682612e-05,
      "loss": 1.2887,
      "step": 151
    },
    {
      "epoch": 0.4198895027624309,
      "grad_norm": 0.7431464195251465,
      "learning_rate": 1.2988136751217292e-05,
      "loss": 1.4228,
      "step": 152
    },
    {
      "epoch": 0.42265193370165743,
      "grad_norm": 0.7429276704788208,
      "learning_rate": 1.2902846772544625e-05,
      "loss": 1.5767,
      "step": 153
    },
    {
      "epoch": 0.425414364640884,
      "grad_norm": 0.672516405582428,
      "learning_rate": 1.2817325568414299e-05,
      "loss": 1.4013,
      "step": 154
    },
    {
      "epoch": 0.4281767955801105,
      "grad_norm": 0.7116546630859375,
      "learning_rate": 1.27315799509942e-05,
      "loss": 1.4784,
      "step": 155
    },
    {
      "epoch": 0.430939226519337,
      "grad_norm": 0.7850514054298401,
      "learning_rate": 1.2645616750327792e-05,
      "loss": 1.4335,
      "step": 156
    },
    {
      "epoch": 0.43370165745856354,
      "grad_norm": 0.7218865752220154,
      "learning_rate": 1.2559442813790077e-05,
      "loss": 1.4628,
      "step": 157
    },
    {
      "epoch": 0.43646408839779005,
      "grad_norm": 0.746462881565094,
      "learning_rate": 1.2473065005542155e-05,
      "loss": 1.4531,
      "step": 158
    },
    {
      "epoch": 0.43922651933701656,
      "grad_norm": 0.7100480794906616,
      "learning_rate": 1.2386490205984488e-05,
      "loss": 1.4729,
      "step": 159
    },
    {
      "epoch": 0.4419889502762431,
      "grad_norm": 0.7361482977867126,
      "learning_rate": 1.2299725311208807e-05,
      "loss": 1.5175,
      "step": 160
    },
    {
      "epoch": 0.4447513812154696,
      "grad_norm": 0.7085789442062378,
      "learning_rate": 1.2212777232448837e-05,
      "loss": 1.4351,
      "step": 161
    },
    {
      "epoch": 0.44751381215469616,
      "grad_norm": 0.6986429691314697,
      "learning_rate": 1.2125652895529766e-05,
      "loss": 1.484,
      "step": 162
    },
    {
      "epoch": 0.45027624309392267,
      "grad_norm": 0.656014084815979,
      "learning_rate": 1.2038359240316589e-05,
      "loss": 1.4858,
      "step": 163
    },
    {
      "epoch": 0.4530386740331492,
      "grad_norm": 0.7168847322463989,
      "learning_rate": 1.1950903220161286e-05,
      "loss": 1.4222,
      "step": 164
    },
    {
      "epoch": 0.4558011049723757,
      "grad_norm": 0.6796424388885498,
      "learning_rate": 1.186329180134898e-05,
      "loss": 1.5088,
      "step": 165
    },
    {
      "epoch": 0.4585635359116022,
      "grad_norm": 0.6602550148963928,
      "learning_rate": 1.1775531962543036e-05,
      "loss": 1.2236,
      "step": 166
    },
    {
      "epoch": 0.4613259668508287,
      "grad_norm": 0.7656331062316895,
      "learning_rate": 1.1687630694229159e-05,
      "loss": 1.4906,
      "step": 167
    },
    {
      "epoch": 0.46408839779005523,
      "grad_norm": 0.7794011235237122,
      "learning_rate": 1.1599594998158602e-05,
      "loss": 1.4335,
      "step": 168
    },
    {
      "epoch": 0.46685082872928174,
      "grad_norm": 0.7844555377960205,
      "learning_rate": 1.1511431886790407e-05,
      "loss": 1.3969,
      "step": 169
    },
    {
      "epoch": 0.4696132596685083,
      "grad_norm": 0.6949150562286377,
      "learning_rate": 1.1423148382732854e-05,
      "loss": 1.4488,
      "step": 170
    },
    {
      "epoch": 0.4723756906077348,
      "grad_norm": 0.866104781627655,
      "learning_rate": 1.1334751518184062e-05,
      "loss": 1.3399,
      "step": 171
    },
    {
      "epoch": 0.47513812154696133,
      "grad_norm": 0.763599157333374,
      "learning_rate": 1.124624833437186e-05,
      "loss": 1.5064,
      "step": 172
    },
    {
      "epoch": 0.47790055248618785,
      "grad_norm": 0.7990726232528687,
      "learning_rate": 1.1157645880992901e-05,
      "loss": 1.4328,
      "step": 173
    },
    {
      "epoch": 0.48066298342541436,
      "grad_norm": 0.782218873500824,
      "learning_rate": 1.1068951215651132e-05,
      "loss": 1.3896,
      "step": 174
    },
    {
      "epoch": 0.48342541436464087,
      "grad_norm": 0.741022527217865,
      "learning_rate": 1.098017140329561e-05,
      "loss": 1.3803,
      "step": 175
    },
    {
      "epoch": 0.4861878453038674,
      "grad_norm": 0.8042486310005188,
      "learning_rate": 1.089131351565776e-05,
      "loss": 1.4333,
      "step": 176
    },
    {
      "epoch": 0.4889502762430939,
      "grad_norm": 0.7474654316902161,
      "learning_rate": 1.080238463068808e-05,
      "loss": 1.4138,
      "step": 177
    },
    {
      "epoch": 0.49171270718232046,
      "grad_norm": 0.7193496227264404,
      "learning_rate": 1.0713391831992324e-05,
      "loss": 1.4721,
      "step": 178
    },
    {
      "epoch": 0.494475138121547,
      "grad_norm": 0.7830453515052795,
      "learning_rate": 1.0624342208267293e-05,
      "loss": 1.475,
      "step": 179
    },
    {
      "epoch": 0.4972375690607735,
      "grad_norm": 0.6899390816688538,
      "learning_rate": 1.0535242852736152e-05,
      "loss": 1.3806,
      "step": 180
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.779139518737793,
      "learning_rate": 1.0446100862583459e-05,
      "loss": 1.4362,
      "step": 181
    },
    {
      "epoch": 0.5027624309392266,
      "grad_norm": 0.7901574969291687,
      "learning_rate": 1.0356923338389807e-05,
      "loss": 1.5008,
      "step": 182
    },
    {
      "epoch": 0.5027624309392266,
      "eval_loss": 1.09184730052948,
      "eval_runtime": 154.1688,
      "eval_samples_per_second": 9.23,
      "eval_steps_per_second": 0.292,
      "step": 182
    },
    {
      "epoch": 0.505524861878453,
      "grad_norm": 0.7908266186714172,
      "learning_rate": 1.0267717383566247e-05,
      "loss": 1.4892,
      "step": 183
    },
    {
      "epoch": 0.5082872928176796,
      "grad_norm": 0.7942709922790527,
      "learning_rate": 1.0178490103788462e-05,
      "loss": 1.391,
      "step": 184
    },
    {
      "epoch": 0.511049723756906,
      "grad_norm": 0.76580411195755,
      "learning_rate": 1.0089248606430775e-05,
      "loss": 1.382,
      "step": 185
    },
    {
      "epoch": 0.5138121546961326,
      "grad_norm": 0.7731238603591919,
      "learning_rate": 1e-05,
      "loss": 1.4122,
      "step": 186
    },
    {
      "epoch": 0.5165745856353591,
      "grad_norm": 0.8293752074241638,
      "learning_rate": 9.910751393569228e-06,
      "loss": 1.5193,
      "step": 187
    },
    {
      "epoch": 0.5193370165745856,
      "grad_norm": 0.9073200225830078,
      "learning_rate": 9.82150989621154e-06,
      "loss": 1.3044,
      "step": 188
    },
    {
      "epoch": 0.5220994475138122,
      "grad_norm": 0.7942474484443665,
      "learning_rate": 9.732282616433756e-06,
      "loss": 1.4997,
      "step": 189
    },
    {
      "epoch": 0.5248618784530387,
      "grad_norm": 0.7780297994613647,
      "learning_rate": 9.643076661610197e-06,
      "loss": 1.4907,
      "step": 190
    },
    {
      "epoch": 0.5276243093922652,
      "grad_norm": 0.8634337782859802,
      "learning_rate": 9.553899137416546e-06,
      "loss": 1.3418,
      "step": 191
    },
    {
      "epoch": 0.5303867403314917,
      "grad_norm": 0.7954802513122559,
      "learning_rate": 9.464757147263849e-06,
      "loss": 1.4508,
      "step": 192
    },
    {
      "epoch": 0.5331491712707183,
      "grad_norm": 0.8032029271125793,
      "learning_rate": 9.37565779173271e-06,
      "loss": 1.4302,
      "step": 193
    },
    {
      "epoch": 0.5359116022099447,
      "grad_norm": 0.7397053837776184,
      "learning_rate": 9.286608168007678e-06,
      "loss": 1.5004,
      "step": 194
    },
    {
      "epoch": 0.5386740331491713,
      "grad_norm": 0.9212160706520081,
      "learning_rate": 9.197615369311926e-06,
      "loss": 1.4778,
      "step": 195
    },
    {
      "epoch": 0.5414364640883977,
      "grad_norm": 0.779822587966919,
      "learning_rate": 9.108686484342241e-06,
      "loss": 1.4482,
      "step": 196
    },
    {
      "epoch": 0.5441988950276243,
      "grad_norm": 0.7739251852035522,
      "learning_rate": 9.019828596704394e-06,
      "loss": 1.4599,
      "step": 197
    },
    {
      "epoch": 0.5469613259668509,
      "grad_norm": 0.7287247776985168,
      "learning_rate": 8.931048784348875e-06,
      "loss": 1.4667,
      "step": 198
    },
    {
      "epoch": 0.5497237569060773,
      "grad_norm": 0.758631706237793,
      "learning_rate": 8.8423541190071e-06,
      "loss": 1.3432,
      "step": 199
    },
    {
      "epoch": 0.5524861878453039,
      "grad_norm": 0.7681453824043274,
      "learning_rate": 8.753751665628141e-06,
      "loss": 1.5143,
      "step": 200
    },
    {
      "epoch": 0.5552486187845304,
      "grad_norm": 0.8416883945465088,
      "learning_rate": 8.665248481815941e-06,
      "loss": 1.362,
      "step": 201
    },
    {
      "epoch": 0.5580110497237569,
      "grad_norm": 0.7365747094154358,
      "learning_rate": 8.576851617267151e-06,
      "loss": 1.4192,
      "step": 202
    },
    {
      "epoch": 0.5607734806629834,
      "grad_norm": 0.8545944690704346,
      "learning_rate": 8.488568113209593e-06,
      "loss": 1.4204,
      "step": 203
    },
    {
      "epoch": 0.56353591160221,
      "grad_norm": 0.8342128396034241,
      "learning_rate": 8.4004050018414e-06,
      "loss": 1.3981,
      "step": 204
    },
    {
      "epoch": 0.5662983425414365,
      "grad_norm": 0.7602656483650208,
      "learning_rate": 8.312369305770843e-06,
      "loss": 1.437,
      "step": 205
    },
    {
      "epoch": 0.569060773480663,
      "grad_norm": 0.8728726506233215,
      "learning_rate": 8.224468037456969e-06,
      "loss": 1.332,
      "step": 206
    },
    {
      "epoch": 0.5718232044198895,
      "grad_norm": 0.845259428024292,
      "learning_rate": 8.136708198651022e-06,
      "loss": 1.4522,
      "step": 207
    },
    {
      "epoch": 0.574585635359116,
      "grad_norm": 0.8054398894309998,
      "learning_rate": 8.04909677983872e-06,
      "loss": 1.3441,
      "step": 208
    },
    {
      "epoch": 0.5773480662983426,
      "grad_norm": 0.7871323227882385,
      "learning_rate": 7.961640759683416e-06,
      "loss": 1.4925,
      "step": 209
    },
    {
      "epoch": 0.580110497237569,
      "grad_norm": 0.8431714177131653,
      "learning_rate": 7.874347104470234e-06,
      "loss": 1.3429,
      "step": 210
    },
    {
      "epoch": 0.5828729281767956,
      "grad_norm": 0.8485934138298035,
      "learning_rate": 7.787222767551164e-06,
      "loss": 1.4277,
      "step": 211
    },
    {
      "epoch": 0.585635359116022,
      "grad_norm": 0.8744496703147888,
      "learning_rate": 7.700274688791196e-06,
      "loss": 1.436,
      "step": 212
    },
    {
      "epoch": 0.5883977900552486,
      "grad_norm": 0.8178616762161255,
      "learning_rate": 7.613509794015517e-06,
      "loss": 1.4106,
      "step": 213
    },
    {
      "epoch": 0.5911602209944752,
      "grad_norm": 0.8785072565078735,
      "learning_rate": 7.5269349944578454e-06,
      "loss": 1.461,
      "step": 214
    },
    {
      "epoch": 0.5939226519337016,
      "grad_norm": 0.7647153735160828,
      "learning_rate": 7.440557186209927e-06,
      "loss": 1.3674,
      "step": 215
    },
    {
      "epoch": 0.5966850828729282,
      "grad_norm": 0.7870045900344849,
      "learning_rate": 7.354383249672212e-06,
      "loss": 1.4552,
      "step": 216
    },
    {
      "epoch": 0.5994475138121547,
      "grad_norm": 0.791970431804657,
      "learning_rate": 7.268420049005806e-06,
      "loss": 1.4316,
      "step": 217
    },
    {
      "epoch": 0.6022099447513812,
      "grad_norm": 0.8452421426773071,
      "learning_rate": 7.182674431585703e-06,
      "loss": 1.4539,
      "step": 218
    },
    {
      "epoch": 0.6049723756906077,
      "grad_norm": 0.9066639542579651,
      "learning_rate": 7.097153227455379e-06,
      "loss": 1.4379,
      "step": 219
    },
    {
      "epoch": 0.6077348066298343,
      "grad_norm": 0.8649947643280029,
      "learning_rate": 7.011863248782711e-06,
      "loss": 1.3483,
      "step": 220
    },
    {
      "epoch": 0.6104972375690608,
      "grad_norm": 0.8475552201271057,
      "learning_rate": 6.92681128931739e-06,
      "loss": 1.4798,
      "step": 221
    },
    {
      "epoch": 0.6132596685082873,
      "grad_norm": 0.8348631262779236,
      "learning_rate": 6.8420041238497525e-06,
      "loss": 1.4084,
      "step": 222
    },
    {
      "epoch": 0.6160220994475138,
      "grad_norm": 0.9010429382324219,
      "learning_rate": 6.7574485076711285e-06,
      "loss": 1.4011,
      "step": 223
    },
    {
      "epoch": 0.6187845303867403,
      "grad_norm": 0.8535116910934448,
      "learning_rate": 6.673151176035762e-06,
      "loss": 1.3661,
      "step": 224
    },
    {
      "epoch": 0.6215469613259669,
      "grad_norm": 0.8269426822662354,
      "learning_rate": 6.589118843624316e-06,
      "loss": 1.3894,
      "step": 225
    },
    {
      "epoch": 0.6243093922651933,
      "grad_norm": 0.76876300573349,
      "learning_rate": 6.505358204009018e-06,
      "loss": 1.4496,
      "step": 226
    },
    {
      "epoch": 0.6270718232044199,
      "grad_norm": 0.7975645065307617,
      "learning_rate": 6.421875929120469e-06,
      "loss": 1.3987,
      "step": 227
    },
    {
      "epoch": 0.6298342541436464,
      "grad_norm": 0.8084754347801208,
      "learning_rate": 6.33867866871621e-06,
      "loss": 1.4141,
      "step": 228
    },
    {
      "epoch": 0.6325966850828729,
      "grad_norm": 0.7989472150802612,
      "learning_rate": 6.25577304985103e-06,
      "loss": 1.2641,
      "step": 229
    },
    {
      "epoch": 0.6353591160220995,
      "grad_norm": 0.8317887783050537,
      "learning_rate": 6.173165676349103e-06,
      "loss": 1.3783,
      "step": 230
    },
    {
      "epoch": 0.638121546961326,
      "grad_norm": 0.883558452129364,
      "learning_rate": 6.090863128277938e-06,
      "loss": 1.3727,
      "step": 231
    },
    {
      "epoch": 0.6408839779005525,
      "grad_norm": 0.7857452630996704,
      "learning_rate": 6.008871961424259e-06,
      "loss": 1.4159,
      "step": 232
    },
    {
      "epoch": 0.643646408839779,
      "grad_norm": 0.7758345007896423,
      "learning_rate": 5.927198706771813e-06,
      "loss": 1.4271,
      "step": 233
    },
    {
      "epoch": 0.6464088397790055,
      "grad_norm": 0.8178966045379639,
      "learning_rate": 5.845849869981137e-06,
      "loss": 1.4526,
      "step": 234
    },
    {
      "epoch": 0.649171270718232,
      "grad_norm": 0.8668413758277893,
      "learning_rate": 5.764831930871346e-06,
      "loss": 1.4209,
      "step": 235
    },
    {
      "epoch": 0.6519337016574586,
      "grad_norm": 0.866632342338562,
      "learning_rate": 5.684151342903992e-06,
      "loss": 1.4785,
      "step": 236
    },
    {
      "epoch": 0.6546961325966851,
      "grad_norm": 0.8386754393577576,
      "learning_rate": 5.603814532669032e-06,
      "loss": 1.4133,
      "step": 237
    },
    {
      "epoch": 0.6574585635359116,
      "grad_norm": 0.8321985006332397,
      "learning_rate": 5.523827899372876e-06,
      "loss": 1.4049,
      "step": 238
    },
    {
      "epoch": 0.6602209944751382,
      "grad_norm": 0.8785558342933655,
      "learning_rate": 5.444197814328707e-06,
      "loss": 1.3697,
      "step": 239
    },
    {
      "epoch": 0.6629834254143646,
      "grad_norm": 0.7912325263023376,
      "learning_rate": 5.364930620448946e-06,
      "loss": 1.4928,
      "step": 240
    },
    {
      "epoch": 0.6657458563535912,
      "grad_norm": 0.8722138404846191,
      "learning_rate": 5.286032631740023e-06,
      "loss": 1.4547,
      "step": 241
    },
    {
      "epoch": 0.6685082872928176,
      "grad_norm": 0.8593927621841431,
      "learning_rate": 5.207510132799436e-06,
      "loss": 1.4188,
      "step": 242
    },
    {
      "epoch": 0.6712707182320442,
      "grad_norm": 0.7740910649299622,
      "learning_rate": 5.129369378315128e-06,
      "loss": 1.4642,
      "step": 243
    },
    {
      "epoch": 0.6740331491712708,
      "grad_norm": 0.8590474724769592,
      "learning_rate": 5.051616592567323e-06,
      "loss": 1.2543,
      "step": 244
    },
    {
      "epoch": 0.6767955801104972,
      "grad_norm": 0.862617015838623,
      "learning_rate": 4.974257968932687e-06,
      "loss": 1.4393,
      "step": 245
    },
    {
      "epoch": 0.6795580110497238,
      "grad_norm": 0.8476159572601318,
      "learning_rate": 4.897299669391006e-06,
      "loss": 1.4986,
      "step": 246
    },
    {
      "epoch": 0.6823204419889503,
      "grad_norm": 0.8105822205543518,
      "learning_rate": 4.820747824034369e-06,
      "loss": 1.5027,
      "step": 247
    },
    {
      "epoch": 0.6850828729281768,
      "grad_norm": 0.9459635615348816,
      "learning_rate": 4.744608530578872e-06,
      "loss": 1.3336,
      "step": 248
    },
    {
      "epoch": 0.6878453038674033,
      "grad_norm": 0.8690259456634521,
      "learning_rate": 4.668887853878896e-06,
      "loss": 1.3479,
      "step": 249
    },
    {
      "epoch": 0.6906077348066298,
      "grad_norm": 0.8513994812965393,
      "learning_rate": 4.593591825444028e-06,
      "loss": 1.5261,
      "step": 250
    },
    {
      "epoch": 0.6933701657458563,
      "grad_norm": 0.8456645011901855,
      "learning_rate": 4.518726442958599e-06,
      "loss": 1.4669,
      "step": 251
    },
    {
      "epoch": 0.6961325966850829,
      "grad_norm": 0.8776260018348694,
      "learning_rate": 4.444297669803981e-06,
      "loss": 1.4177,
      "step": 252
    },
    {
      "epoch": 0.6988950276243094,
      "grad_norm": 0.860339343547821,
      "learning_rate": 4.370311434583525e-06,
      "loss": 1.454,
      "step": 253
    },
    {
      "epoch": 0.7016574585635359,
      "grad_norm": 0.902574360370636,
      "learning_rate": 4.296773630650358e-06,
      "loss": 1.3987,
      "step": 254
    },
    {
      "epoch": 0.7044198895027625,
      "grad_norm": 0.8529084920883179,
      "learning_rate": 4.223690115637944e-06,
      "loss": 1.4428,
      "step": 255
    },
    {
      "epoch": 0.7071823204419889,
      "grad_norm": 0.8335188627243042,
      "learning_rate": 4.15106671099347e-06,
      "loss": 1.4812,
      "step": 256
    },
    {
      "epoch": 0.7099447513812155,
      "grad_norm": 0.8945775628089905,
      "learning_rate": 4.078909201514172e-06,
      "loss": 1.3559,
      "step": 257
    },
    {
      "epoch": 0.712707182320442,
      "grad_norm": 0.9545679688453674,
      "learning_rate": 4.007223334886531e-06,
      "loss": 1.3806,
      "step": 258
    },
    {
      "epoch": 0.7154696132596685,
      "grad_norm": 0.7985166311264038,
      "learning_rate": 3.936014821228448e-06,
      "loss": 1.3433,
      "step": 259
    },
    {
      "epoch": 0.7182320441988951,
      "grad_norm": 0.8410876393318176,
      "learning_rate": 3.865289332634407e-06,
      "loss": 1.4393,
      "step": 260
    },
    {
      "epoch": 0.7209944751381215,
      "grad_norm": 0.8620865941047668,
      "learning_rate": 3.7950525027236585e-06,
      "loss": 1.2242,
      "step": 261
    },
    {
      "epoch": 0.7237569060773481,
      "grad_norm": 0.8445248603820801,
      "learning_rate": 3.7253099261914794e-06,
      "loss": 1.403,
      "step": 262
    },
    {
      "epoch": 0.7265193370165746,
      "grad_norm": 0.8497399687767029,
      "learning_rate": 3.6560671583635467e-06,
      "loss": 1.4453,
      "step": 263
    },
    {
      "epoch": 0.7292817679558011,
      "grad_norm": 0.83712238073349,
      "learning_rate": 3.5873297147533913e-06,
      "loss": 1.503,
      "step": 264
    },
    {
      "epoch": 0.7320441988950276,
      "grad_norm": 0.8133417963981628,
      "learning_rate": 3.5191030706230967e-06,
      "loss": 1.3821,
      "step": 265
    },
    {
      "epoch": 0.7348066298342542,
      "grad_norm": 0.8870840668678284,
      "learning_rate": 3.4513926605471504e-06,
      "loss": 1.4533,
      "step": 266
    },
    {
      "epoch": 0.7375690607734806,
      "grad_norm": 0.9070349335670471,
      "learning_rate": 3.3842038779795594e-06,
      "loss": 1.4055,
      "step": 267
    },
    {
      "epoch": 0.7403314917127072,
      "grad_norm": 0.8012280464172363,
      "learning_rate": 3.3175420748242405e-06,
      "loss": 1.2719,
      "step": 268
    },
    {
      "epoch": 0.7430939226519337,
      "grad_norm": 0.8506048321723938,
      "learning_rate": 3.2514125610086957e-06,
      "loss": 1.3588,
      "step": 269
    },
    {
      "epoch": 0.7458563535911602,
      "grad_norm": 0.7189351916313171,
      "learning_rate": 3.1858206040610883e-06,
      "loss": 1.2039,
      "step": 270
    },
    {
      "epoch": 0.7486187845303868,
      "grad_norm": 0.8134811520576477,
      "learning_rate": 3.1207714286906253e-06,
      "loss": 1.2114,
      "step": 271
    },
    {
      "epoch": 0.7513812154696132,
      "grad_norm": 0.8526425957679749,
      "learning_rate": 3.0562702163713954e-06,
      "loss": 1.5044,
      "step": 272
    },
    {
      "epoch": 0.7541436464088398,
      "grad_norm": 0.9256671071052551,
      "learning_rate": 2.9923221049296448e-06,
      "loss": 1.3762,
      "step": 273
    },
    {
      "epoch": 0.7541436464088398,
      "eval_loss": 1.0853379964828491,
      "eval_runtime": 155.2763,
      "eval_samples_per_second": 9.164,
      "eval_steps_per_second": 0.29,
      "step": 273
    },
    {
      "epoch": 0.7569060773480663,
      "grad_norm": 1.0012723207473755,
      "learning_rate": 2.9289321881345257e-06,
      "loss": 1.4391,
      "step": 274
    },
    {
      "epoch": 0.7596685082872928,
      "grad_norm": 0.9301908016204834,
      "learning_rate": 2.8661055152923456e-06,
      "loss": 1.3913,
      "step": 275
    },
    {
      "epoch": 0.7624309392265194,
      "grad_norm": 0.9066746830940247,
      "learning_rate": 2.8038470908443717e-06,
      "loss": 1.3527,
      "step": 276
    },
    {
      "epoch": 0.7651933701657458,
      "grad_norm": 0.8317292332649231,
      "learning_rate": 2.742161873968202e-06,
      "loss": 1.4783,
      "step": 277
    },
    {
      "epoch": 0.7679558011049724,
      "grad_norm": 0.892299473285675,
      "learning_rate": 2.681054778182748e-06,
      "loss": 1.3643,
      "step": 278
    },
    {
      "epoch": 0.7707182320441989,
      "grad_norm": 0.8800353407859802,
      "learning_rate": 2.6205306709568358e-06,
      "loss": 1.4015,
      "step": 279
    },
    {
      "epoch": 0.7734806629834254,
      "grad_norm": 0.8604403138160706,
      "learning_rate": 2.5605943733215044e-06,
      "loss": 1.4247,
      "step": 280
    },
    {
      "epoch": 0.7762430939226519,
      "grad_norm": 0.8747833967208862,
      "learning_rate": 2.501250659485992e-06,
      "loss": 1.3748,
      "step": 281
    },
    {
      "epoch": 0.7790055248618785,
      "grad_norm": 0.8854520320892334,
      "learning_rate": 2.4425042564574186e-06,
      "loss": 1.4518,
      "step": 282
    },
    {
      "epoch": 0.7817679558011049,
      "grad_norm": 0.8976154327392578,
      "learning_rate": 2.38435984366429e-06,
      "loss": 1.3765,
      "step": 283
    },
    {
      "epoch": 0.7845303867403315,
      "grad_norm": 0.8289183378219604,
      "learning_rate": 2.3268220525837436e-06,
      "loss": 1.4559,
      "step": 284
    },
    {
      "epoch": 0.787292817679558,
      "grad_norm": 0.8268212080001831,
      "learning_rate": 2.26989546637263e-06,
      "loss": 1.4105,
      "step": 285
    },
    {
      "epoch": 0.7900552486187845,
      "grad_norm": 0.7702677249908447,
      "learning_rate": 2.213584619502451e-06,
      "loss": 1.4579,
      "step": 286
    },
    {
      "epoch": 0.7928176795580111,
      "grad_norm": 0.9030853509902954,
      "learning_rate": 2.157893997398146e-06,
      "loss": 1.3844,
      "step": 287
    },
    {
      "epoch": 0.7955801104972375,
      "grad_norm": 0.8461067080497742,
      "learning_rate": 2.1028280360808405e-06,
      "loss": 1.4283,
      "step": 288
    },
    {
      "epoch": 0.7983425414364641,
      "grad_norm": 0.860618531703949,
      "learning_rate": 2.0483911218144713e-06,
      "loss": 1.4576,
      "step": 289
    },
    {
      "epoch": 0.8011049723756906,
      "grad_norm": 0.9135028123855591,
      "learning_rate": 1.994587590756397e-06,
      "loss": 1.3549,
      "step": 290
    },
    {
      "epoch": 0.8038674033149171,
      "grad_norm": 0.936713457107544,
      "learning_rate": 1.941421728612023e-06,
      "loss": 1.4119,
      "step": 291
    },
    {
      "epoch": 0.8066298342541437,
      "grad_norm": 0.9670917987823486,
      "learning_rate": 1.8888977702934086e-06,
      "loss": 1.4523,
      "step": 292
    },
    {
      "epoch": 0.8093922651933702,
      "grad_norm": 0.927078902721405,
      "learning_rate": 1.8370198995819432e-06,
      "loss": 1.405,
      "step": 293
    },
    {
      "epoch": 0.8121546961325967,
      "grad_norm": 0.8833804130554199,
      "learning_rate": 1.7857922487950873e-06,
      "loss": 1.4527,
      "step": 294
    },
    {
      "epoch": 0.8149171270718232,
      "grad_norm": 0.821182131767273,
      "learning_rate": 1.7352188984572026e-06,
      "loss": 1.3541,
      "step": 295
    },
    {
      "epoch": 0.8176795580110497,
      "grad_norm": 0.8782601952552795,
      "learning_rate": 1.6853038769745466e-06,
      "loss": 1.45,
      "step": 296
    },
    {
      "epoch": 0.8204419889502762,
      "grad_norm": 0.8677568435668945,
      "learning_rate": 1.6360511603143648e-06,
      "loss": 1.4061,
      "step": 297
    },
    {
      "epoch": 0.8232044198895028,
      "grad_norm": 0.9125531911849976,
      "learning_rate": 1.587464671688187e-06,
      "loss": 1.4138,
      "step": 298
    },
    {
      "epoch": 0.8259668508287292,
      "grad_norm": 0.8667160272598267,
      "learning_rate": 1.5395482812393513e-06,
      "loss": 1.4394,
      "step": 299
    },
    {
      "epoch": 0.8287292817679558,
      "grad_norm": 0.9092755913734436,
      "learning_rate": 1.492305805734693e-06,
      "loss": 1.4223,
      "step": 300
    },
    {
      "epoch": 0.8314917127071824,
      "grad_norm": 0.813664436340332,
      "learning_rate": 1.4457410082605483e-06,
      "loss": 1.4421,
      "step": 301
    },
    {
      "epoch": 0.8342541436464088,
      "grad_norm": 0.9390541911125183,
      "learning_rate": 1.3998575979229944e-06,
      "loss": 1.3292,
      "step": 302
    },
    {
      "epoch": 0.8370165745856354,
      "grad_norm": 0.9930654168128967,
      "learning_rate": 1.3546592295524075e-06,
      "loss": 1.4802,
      "step": 303
    },
    {
      "epoch": 0.8397790055248618,
      "grad_norm": 0.8965404629707336,
      "learning_rate": 1.3101495034123313e-06,
      "loss": 1.4596,
      "step": 304
    },
    {
      "epoch": 0.8425414364640884,
      "grad_norm": 0.9070413112640381,
      "learning_rate": 1.2663319649127025e-06,
      "loss": 1.4076,
      "step": 305
    },
    {
      "epoch": 0.8453038674033149,
      "grad_norm": 0.7749320268630981,
      "learning_rate": 1.2232101043274437e-06,
      "loss": 1.2553,
      "step": 306
    },
    {
      "epoch": 0.8480662983425414,
      "grad_norm": 0.949621856212616,
      "learning_rate": 1.1807873565164507e-06,
      "loss": 1.3909,
      "step": 307
    },
    {
      "epoch": 0.850828729281768,
      "grad_norm": 0.8850001096725464,
      "learning_rate": 1.139067100651976e-06,
      "loss": 1.3992,
      "step": 308
    },
    {
      "epoch": 0.8535911602209945,
      "grad_norm": 0.8339431285858154,
      "learning_rate": 1.0980526599494733e-06,
      "loss": 1.4141,
      "step": 309
    },
    {
      "epoch": 0.856353591160221,
      "grad_norm": 0.8964627385139465,
      "learning_rate": 1.0577473014028872e-06,
      "loss": 1.3828,
      "step": 310
    },
    {
      "epoch": 0.8591160220994475,
      "grad_norm": 0.8783918023109436,
      "learning_rate": 1.0181542355244167e-06,
      "loss": 1.3791,
      "step": 311
    },
    {
      "epoch": 0.861878453038674,
      "grad_norm": 0.8714961409568787,
      "learning_rate": 9.792766160887868e-07,
      "loss": 1.4513,
      "step": 312
    },
    {
      "epoch": 0.8646408839779005,
      "grad_norm": 0.8861087560653687,
      "learning_rate": 9.411175398820271e-07,
      "loss": 1.3653,
      "step": 313
    },
    {
      "epoch": 0.8674033149171271,
      "grad_norm": 0.8015440702438354,
      "learning_rate": 9.036800464548157e-07,
      "loss": 1.4818,
      "step": 314
    },
    {
      "epoch": 0.8701657458563536,
      "grad_norm": 0.8085585832595825,
      "learning_rate": 8.669671178803485e-07,
      "loss": 1.4441,
      "step": 315
    },
    {
      "epoch": 0.8729281767955801,
      "grad_norm": 0.8612155318260193,
      "learning_rate": 8.309816785168035e-07,
      "loss": 1.3938,
      "step": 316
    },
    {
      "epoch": 0.8756906077348067,
      "grad_norm": 0.866340160369873,
      "learning_rate": 7.957265947744131e-07,
      "loss": 1.3557,
      "step": 317
    },
    {
      "epoch": 0.8784530386740331,
      "grad_norm": 0.8810187578201294,
      "learning_rate": 7.612046748871327e-07,
      "loss": 1.3672,
      "step": 318
    },
    {
      "epoch": 0.8812154696132597,
      "grad_norm": 0.9028764367103577,
      "learning_rate": 7.274186686889539e-07,
      "loss": 1.411,
      "step": 319
    },
    {
      "epoch": 0.8839779005524862,
      "grad_norm": 0.9450622797012329,
      "learning_rate": 6.943712673948643e-07,
      "loss": 1.3494,
      "step": 320
    },
    {
      "epoch": 0.8867403314917127,
      "grad_norm": 0.7640666365623474,
      "learning_rate": 6.620651033864844e-07,
      "loss": 1.4108,
      "step": 321
    },
    {
      "epoch": 0.8895027624309392,
      "grad_norm": 0.8005813360214233,
      "learning_rate": 6.305027500023841e-07,
      "loss": 1.4023,
      "step": 322
    },
    {
      "epoch": 0.8922651933701657,
      "grad_norm": 0.8076258301734924,
      "learning_rate": 5.996867213330993e-07,
      "loss": 1.4143,
      "step": 323
    },
    {
      "epoch": 0.8950276243093923,
      "grad_norm": 0.9263111352920532,
      "learning_rate": 5.696194720208792e-07,
      "loss": 1.4247,
      "step": 324
    },
    {
      "epoch": 0.8977900552486188,
      "grad_norm": 0.7982999682426453,
      "learning_rate": 5.403033970641647e-07,
      "loss": 1.4022,
      "step": 325
    },
    {
      "epoch": 0.9005524861878453,
      "grad_norm": 0.8366706967353821,
      "learning_rate": 5.117408316268047e-07,
      "loss": 1.5076,
      "step": 326
    },
    {
      "epoch": 0.9033149171270718,
      "grad_norm": 0.8495275974273682,
      "learning_rate": 4.839340508520563e-07,
      "loss": 1.4251,
      "step": 327
    },
    {
      "epoch": 0.9060773480662984,
      "grad_norm": 0.8747670650482178,
      "learning_rate": 4.5688526968136193e-07,
      "loss": 1.3898,
      "step": 328
    },
    {
      "epoch": 0.9088397790055248,
      "grad_norm": 0.8361295461654663,
      "learning_rate": 4.305966426779118e-07,
      "loss": 1.4397,
      "step": 329
    },
    {
      "epoch": 0.9116022099447514,
      "grad_norm": 0.8162137866020203,
      "learning_rate": 4.0507026385502747e-07,
      "loss": 1.4023,
      "step": 330
    },
    {
      "epoch": 0.914364640883978,
      "grad_norm": 0.8866202235221863,
      "learning_rate": 3.8030816650935777e-07,
      "loss": 1.3697,
      "step": 331
    },
    {
      "epoch": 0.9171270718232044,
      "grad_norm": 0.865460216999054,
      "learning_rate": 3.5631232305893047e-07,
      "loss": 1.4998,
      "step": 332
    },
    {
      "epoch": 0.919889502762431,
      "grad_norm": 0.9784380197525024,
      "learning_rate": 3.3308464488602587e-07,
      "loss": 1.4262,
      "step": 333
    },
    {
      "epoch": 0.9226519337016574,
      "grad_norm": 0.8989951610565186,
      "learning_rate": 3.106269821849273e-07,
      "loss": 1.3742,
      "step": 334
    },
    {
      "epoch": 0.925414364640884,
      "grad_norm": 0.8595617413520813,
      "learning_rate": 2.889411238145545e-07,
      "loss": 1.4038,
      "step": 335
    },
    {
      "epoch": 0.9281767955801105,
      "grad_norm": 0.8496826887130737,
      "learning_rate": 2.6802879715596585e-07,
      "loss": 1.3938,
      "step": 336
    },
    {
      "epoch": 0.930939226519337,
      "grad_norm": 0.8510773777961731,
      "learning_rate": 2.478916679747623e-07,
      "loss": 1.4617,
      "step": 337
    },
    {
      "epoch": 0.9337016574585635,
      "grad_norm": 0.8786448240280151,
      "learning_rate": 2.2853134028840594e-07,
      "loss": 1.3615,
      "step": 338
    },
    {
      "epoch": 0.93646408839779,
      "grad_norm": 0.9002432823181152,
      "learning_rate": 2.099493562384469e-07,
      "loss": 1.4047,
      "step": 339
    },
    {
      "epoch": 0.9392265193370166,
      "grad_norm": 0.8121012449264526,
      "learning_rate": 1.921471959676957e-07,
      "loss": 1.4307,
      "step": 340
    },
    {
      "epoch": 0.9419889502762431,
      "grad_norm": 0.8053271770477295,
      "learning_rate": 1.7512627750230772e-07,
      "loss": 1.4443,
      "step": 341
    },
    {
      "epoch": 0.9447513812154696,
      "grad_norm": 0.9050955176353455,
      "learning_rate": 1.5888795663883904e-07,
      "loss": 1.4102,
      "step": 342
    },
    {
      "epoch": 0.9475138121546961,
      "grad_norm": 0.9110968112945557,
      "learning_rate": 1.4343352683625412e-07,
      "loss": 1.4198,
      "step": 343
    },
    {
      "epoch": 0.9502762430939227,
      "grad_norm": 0.929118812084198,
      "learning_rate": 1.2876421911288906e-07,
      "loss": 1.4203,
      "step": 344
    },
    {
      "epoch": 0.9530386740331491,
      "grad_norm": 0.9060593843460083,
      "learning_rate": 1.148812019483958e-07,
      "loss": 1.3268,
      "step": 345
    },
    {
      "epoch": 0.9558011049723757,
      "grad_norm": 0.8292207717895508,
      "learning_rate": 1.0178558119067316e-07,
      "loss": 1.3877,
      "step": 346
    },
    {
      "epoch": 0.9585635359116023,
      "grad_norm": 0.8450669646263123,
      "learning_rate": 8.947839996777286e-08,
      "loss": 1.3981,
      "step": 347
    },
    {
      "epoch": 0.9613259668508287,
      "grad_norm": 0.9085505604743958,
      "learning_rate": 7.796063860481595e-08,
      "loss": 1.353,
      "step": 348
    },
    {
      "epoch": 0.9640883977900553,
      "grad_norm": 0.8617837429046631,
      "learning_rate": 6.723321454590093e-08,
      "loss": 1.4215,
      "step": 349
    },
    {
      "epoch": 0.9668508287292817,
      "grad_norm": 0.8624401092529297,
      "learning_rate": 5.7296982281026534e-08,
      "loss": 1.3668,
      "step": 350
    },
    {
      "epoch": 0.9696132596685083,
      "grad_norm": 0.8622123003005981,
      "learning_rate": 4.815273327803183e-08,
      "loss": 1.3723,
      "step": 351
    },
    {
      "epoch": 0.9723756906077348,
      "grad_norm": 0.7842567563056946,
      "learning_rate": 3.980119591954101e-08,
      "loss": 1.3255,
      "step": 352
    },
    {
      "epoch": 0.9751381215469613,
      "grad_norm": 0.8146648406982422,
      "learning_rate": 3.224303544495766e-08,
      "loss": 1.3913,
      "step": 353
    },
    {
      "epoch": 0.9779005524861878,
      "grad_norm": 0.8488260507583618,
      "learning_rate": 2.547885389746485e-08,
      "loss": 1.385,
      "step": 354
    },
    {
      "epoch": 0.9806629834254144,
      "grad_norm": 0.8932926058769226,
      "learning_rate": 1.9509190076074657e-08,
      "loss": 1.3992,
      "step": 355
    },
    {
      "epoch": 0.9834254143646409,
      "grad_norm": 0.9198254346847534,
      "learning_rate": 1.4334519492711362e-08,
      "loss": 1.3375,
      "step": 356
    },
    {
      "epoch": 0.9861878453038674,
      "grad_norm": 0.9434136748313904,
      "learning_rate": 9.955254334328424e-09,
      "loss": 1.3329,
      "step": 357
    },
    {
      "epoch": 0.988950276243094,
      "grad_norm": 0.8083603978157043,
      "learning_rate": 6.371743430082511e-09,
      "loss": 1.4152,
      "step": 358
    },
    {
      "epoch": 0.9917127071823204,
      "grad_norm": 0.8143340349197388,
      "learning_rate": 3.5842722235468475e-09,
      "loss": 1.3503,
      "step": 359
    },
    {
      "epoch": 0.994475138121547,
      "grad_norm": 0.8487191200256348,
      "learning_rate": 1.593062749967178e-09,
      "loss": 1.4374,
      "step": 360
    },
    {
      "epoch": 0.9972375690607734,
      "grad_norm": 0.8385122418403625,
      "learning_rate": 3.982736185859093e-10,
      "loss": 1.384,
      "step": 361
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.8801374435424805,
      "learning_rate": 0.0,
      "loss": 1.368,
      "step": 362
    }
  ],
  "logging_steps": 1,
  "max_steps": 362,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6738557053055795e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}