|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 16.836197825324447, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5909375548362732, |
|
"learning_rate": 1.9932584269662923e-05, |
|
"loss": 2.0237, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5826025009155273, |
|
"learning_rate": 1.9857677902621722e-05, |
|
"loss": 1.9306, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5491089820861816, |
|
"learning_rate": 1.9782771535580525e-05, |
|
"loss": 1.7959, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.362810730934143, |
|
"learning_rate": 1.970786516853933e-05, |
|
"loss": 1.6599, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.4427486658096313, |
|
"learning_rate": 1.963295880149813e-05, |
|
"loss": 1.5685, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.9993659257888794, |
|
"learning_rate": 1.956179775280899e-05, |
|
"loss": 1.4621, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.614562749862671, |
|
"learning_rate": 1.9486891385767793e-05, |
|
"loss": 1.31, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.1975798606872559, |
|
"learning_rate": 1.9411985018726593e-05, |
|
"loss": 1.2322, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.7684128880500793, |
|
"learning_rate": 1.9337078651685396e-05, |
|
"loss": 1.1361, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.9336960911750793, |
|
"learning_rate": 1.9262172284644195e-05, |
|
"loss": 1.0797, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.8471770882606506, |
|
"learning_rate": 1.9187265917603e-05, |
|
"loss": 1.0368, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 1.111340045928955, |
|
"learning_rate": 1.9112359550561798e-05, |
|
"loss": 0.9738, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.8093781471252441, |
|
"learning_rate": 1.90374531835206e-05, |
|
"loss": 0.9494, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.8438062071800232, |
|
"learning_rate": 1.89625468164794e-05, |
|
"loss": 0.9276, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.9896701574325562, |
|
"learning_rate": 1.8887640449438204e-05, |
|
"loss": 0.8656, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.8278244137763977, |
|
"learning_rate": 1.8812734082397007e-05, |
|
"loss": 0.8431, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.931291937828064, |
|
"learning_rate": 1.8737827715355807e-05, |
|
"loss": 0.7945, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 1.21769380569458, |
|
"learning_rate": 1.866292134831461e-05, |
|
"loss": 0.7647, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 3.5183286666870117, |
|
"learning_rate": 1.858801498127341e-05, |
|
"loss": 0.7497, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.1153030395507812, |
|
"learning_rate": 1.8513108614232212e-05, |
|
"loss": 0.7507, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 1.0140526294708252, |
|
"learning_rate": 1.8438202247191012e-05, |
|
"loss": 0.7415, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 1.4395232200622559, |
|
"learning_rate": 1.8363295880149815e-05, |
|
"loss": 0.6947, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 1.4253089427947998, |
|
"learning_rate": 1.8288389513108615e-05, |
|
"loss": 0.7429, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.3152351379394531, |
|
"learning_rate": 1.8213483146067418e-05, |
|
"loss": 0.7363, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.5935957431793213, |
|
"learning_rate": 1.8138576779026217e-05, |
|
"loss": 0.6486, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 3.929158926010132, |
|
"learning_rate": 1.806367041198502e-05, |
|
"loss": 0.6395, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.7316572666168213, |
|
"learning_rate": 1.7988764044943823e-05, |
|
"loss": 0.664, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 1.3388841152191162, |
|
"learning_rate": 1.7913857677902623e-05, |
|
"loss": 0.6469, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.5258549451828003, |
|
"learning_rate": 1.7838951310861426e-05, |
|
"loss": 0.6662, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.5486094951629639, |
|
"learning_rate": 1.7764044943820226e-05, |
|
"loss": 0.566, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.5657902956008911, |
|
"learning_rate": 1.768913857677903e-05, |
|
"loss": 0.6166, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 1.5971391201019287, |
|
"learning_rate": 1.761423220973783e-05, |
|
"loss": 0.5973, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 1.333030343055725, |
|
"learning_rate": 1.753932584269663e-05, |
|
"loss": 0.6117, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 1.4425445795059204, |
|
"learning_rate": 1.746441947565543e-05, |
|
"loss": 0.5702, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 1.4773032665252686, |
|
"learning_rate": 1.7389513108614234e-05, |
|
"loss": 0.5465, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 1.3328267335891724, |
|
"learning_rate": 1.7314606741573034e-05, |
|
"loss": 0.5379, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 1.6961455345153809, |
|
"learning_rate": 1.7239700374531837e-05, |
|
"loss": 0.5492, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 1.4636189937591553, |
|
"learning_rate": 1.7164794007490637e-05, |
|
"loss": 0.547, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 2.1686649322509766, |
|
"learning_rate": 1.708988764044944e-05, |
|
"loss": 0.5424, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 1.219388723373413, |
|
"learning_rate": 1.7014981273408243e-05, |
|
"loss": 0.5373, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.5566452741622925, |
|
"learning_rate": 1.6940074906367042e-05, |
|
"loss": 0.4944, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 1.598917841911316, |
|
"learning_rate": 1.6865168539325845e-05, |
|
"loss": 0.5036, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 1.5281039476394653, |
|
"learning_rate": 1.6790262172284645e-05, |
|
"loss": 0.5215, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 1.7123130559921265, |
|
"learning_rate": 1.6715355805243448e-05, |
|
"loss": 0.5362, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"grad_norm": 1.543447732925415, |
|
"learning_rate": 1.6640449438202248e-05, |
|
"loss": 0.5379, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 2.4190192222595215, |
|
"learning_rate": 1.656554307116105e-05, |
|
"loss": 0.4921, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 2.190906047821045, |
|
"learning_rate": 1.649063670411985e-05, |
|
"loss": 0.4652, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 5.39, |
|
"grad_norm": 2.113476514816284, |
|
"learning_rate": 1.6415730337078653e-05, |
|
"loss": 0.4914, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 1.8785656690597534, |
|
"learning_rate": 1.6340823970037453e-05, |
|
"loss": 0.5135, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 5.61, |
|
"grad_norm": 1.3745977878570557, |
|
"learning_rate": 1.6265917602996256e-05, |
|
"loss": 0.4697, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 1.7874308824539185, |
|
"learning_rate": 1.6191011235955056e-05, |
|
"loss": 0.4625, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 1.4448940753936768, |
|
"learning_rate": 1.611610486891386e-05, |
|
"loss": 0.4764, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 5.95, |
|
"grad_norm": 2.278655767440796, |
|
"learning_rate": 1.6041198501872662e-05, |
|
"loss": 0.4221, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 6.06, |
|
"grad_norm": 1.8602409362792969, |
|
"learning_rate": 1.596629213483146e-05, |
|
"loss": 0.4731, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"grad_norm": 1.884373426437378, |
|
"learning_rate": 1.5891385767790265e-05, |
|
"loss": 0.4241, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"grad_norm": 2.0259287357330322, |
|
"learning_rate": 1.5816479400749064e-05, |
|
"loss": 0.4368, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 1.812462329864502, |
|
"learning_rate": 1.5741573033707867e-05, |
|
"loss": 0.442, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"grad_norm": 1.934327483177185, |
|
"learning_rate": 1.5666666666666667e-05, |
|
"loss": 0.4195, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 1.6152955293655396, |
|
"learning_rate": 1.559176029962547e-05, |
|
"loss": 0.4374, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 2.7782068252563477, |
|
"learning_rate": 1.551685393258427e-05, |
|
"loss": 0.4231, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.85, |
|
"grad_norm": 2.372976303100586, |
|
"learning_rate": 1.5441947565543073e-05, |
|
"loss": 0.444, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"grad_norm": 2.171353816986084, |
|
"learning_rate": 1.5367041198501872e-05, |
|
"loss": 0.4389, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"grad_norm": 1.3093984127044678, |
|
"learning_rate": 1.5292134831460675e-05, |
|
"loss": 0.4301, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 7.18, |
|
"grad_norm": 2.267932176589966, |
|
"learning_rate": 1.5217228464419478e-05, |
|
"loss": 0.4046, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 7.3, |
|
"grad_norm": 1.5326164960861206, |
|
"learning_rate": 1.514232209737828e-05, |
|
"loss": 0.4068, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 7.41, |
|
"grad_norm": 3.1525979042053223, |
|
"learning_rate": 1.5067415730337081e-05, |
|
"loss": 0.3847, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"grad_norm": 2.081890106201172, |
|
"learning_rate": 1.4992509363295882e-05, |
|
"loss": 0.4126, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 7.63, |
|
"grad_norm": 2.5701358318328857, |
|
"learning_rate": 1.4917602996254684e-05, |
|
"loss": 0.4065, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 1.4190051555633545, |
|
"learning_rate": 1.4842696629213485e-05, |
|
"loss": 0.3979, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 7.86, |
|
"grad_norm": 1.9085837602615356, |
|
"learning_rate": 1.4767790262172286e-05, |
|
"loss": 0.3894, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 7.97, |
|
"grad_norm": 1.7573003768920898, |
|
"learning_rate": 1.4692883895131088e-05, |
|
"loss": 0.3751, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 8.08, |
|
"grad_norm": 1.8974506855010986, |
|
"learning_rate": 1.4617977528089889e-05, |
|
"loss": 0.3936, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"grad_norm": 1.3843660354614258, |
|
"learning_rate": 1.454307116104869e-05, |
|
"loss": 0.3848, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 8.31, |
|
"grad_norm": 1.525007724761963, |
|
"learning_rate": 1.4468164794007492e-05, |
|
"loss": 0.3552, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 8.42, |
|
"grad_norm": 2.1665101051330566, |
|
"learning_rate": 1.4393258426966291e-05, |
|
"loss": 0.3547, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 8.53, |
|
"grad_norm": 3.3614535331726074, |
|
"learning_rate": 1.4318352059925096e-05, |
|
"loss": 0.3771, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 1.746299386024475, |
|
"learning_rate": 1.4243445692883898e-05, |
|
"loss": 0.396, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 1.9144684076309204, |
|
"learning_rate": 1.4168539325842699e-05, |
|
"loss": 0.3748, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"grad_norm": 1.9617277383804321, |
|
"learning_rate": 1.40936329588015e-05, |
|
"loss": 0.3504, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 2.69067645072937, |
|
"learning_rate": 1.4018726591760302e-05, |
|
"loss": 0.3477, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 2.142008066177368, |
|
"learning_rate": 1.3943820224719103e-05, |
|
"loss": 0.3539, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 1.7684266567230225, |
|
"learning_rate": 1.3868913857677904e-05, |
|
"loss": 0.3576, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"grad_norm": 1.4222275018692017, |
|
"learning_rate": 1.3794007490636706e-05, |
|
"loss": 0.3839, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 9.43, |
|
"grad_norm": 2.0622501373291016, |
|
"learning_rate": 1.3719101123595507e-05, |
|
"loss": 0.3278, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 9.54, |
|
"grad_norm": 1.639147400856018, |
|
"learning_rate": 1.3644194756554308e-05, |
|
"loss": 0.3374, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"grad_norm": 2.093045473098755, |
|
"learning_rate": 1.356928838951311e-05, |
|
"loss": 0.3535, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 9.76, |
|
"grad_norm": 1.3492937088012695, |
|
"learning_rate": 1.3494382022471911e-05, |
|
"loss": 0.3105, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"grad_norm": 1.585205316543579, |
|
"learning_rate": 1.3419475655430714e-05, |
|
"loss": 0.3181, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 9.99, |
|
"grad_norm": 2.8895344734191895, |
|
"learning_rate": 1.3344569288389515e-05, |
|
"loss": 0.3473, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 10.1, |
|
"grad_norm": 1.7224748134613037, |
|
"learning_rate": 1.3269662921348317e-05, |
|
"loss": 0.3524, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 10.21, |
|
"grad_norm": 2.1029868125915527, |
|
"learning_rate": 1.3194756554307118e-05, |
|
"loss": 0.3408, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 10.33, |
|
"grad_norm": 2.434016227722168, |
|
"learning_rate": 1.311985018726592e-05, |
|
"loss": 0.3266, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 10.44, |
|
"grad_norm": 1.953553318977356, |
|
"learning_rate": 1.304494382022472e-05, |
|
"loss": 0.2844, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 10.55, |
|
"grad_norm": 2.5946218967437744, |
|
"learning_rate": 1.2970037453183522e-05, |
|
"loss": 0.3225, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 10.66, |
|
"grad_norm": 2.5305733680725098, |
|
"learning_rate": 1.2895131086142323e-05, |
|
"loss": 0.3183, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 10.78, |
|
"grad_norm": 3.56726336479187, |
|
"learning_rate": 1.2820224719101125e-05, |
|
"loss": 0.2944, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 10.89, |
|
"grad_norm": 1.9687740802764893, |
|
"learning_rate": 1.2745318352059926e-05, |
|
"loss": 0.3411, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 1.6027730703353882, |
|
"learning_rate": 1.2670411985018727e-05, |
|
"loss": 0.2949, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 11.11, |
|
"grad_norm": 1.8739397525787354, |
|
"learning_rate": 1.2595505617977529e-05, |
|
"loss": 0.2716, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 11.22, |
|
"grad_norm": 1.6741198301315308, |
|
"learning_rate": 1.2520599250936332e-05, |
|
"loss": 0.3334, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 11.34, |
|
"grad_norm": 1.950945496559143, |
|
"learning_rate": 1.2445692883895133e-05, |
|
"loss": 0.3291, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 11.45, |
|
"grad_norm": 1.9362170696258545, |
|
"learning_rate": 1.2370786516853935e-05, |
|
"loss": 0.2716, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 1.6201746463775635, |
|
"learning_rate": 1.2295880149812736e-05, |
|
"loss": 0.2893, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"grad_norm": 3.488088607788086, |
|
"learning_rate": 1.2220973782771537e-05, |
|
"loss": 0.3239, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 11.79, |
|
"grad_norm": 2.4608683586120605, |
|
"learning_rate": 1.2146067415730339e-05, |
|
"loss": 0.271, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 11.9, |
|
"grad_norm": 1.5321098566055298, |
|
"learning_rate": 1.207116104868914e-05, |
|
"loss": 0.2876, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 12.01, |
|
"grad_norm": 1.8334771394729614, |
|
"learning_rate": 1.1996254681647941e-05, |
|
"loss": 0.3066, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 12.12, |
|
"grad_norm": 1.9506254196166992, |
|
"learning_rate": 1.1921348314606743e-05, |
|
"loss": 0.3023, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 12.23, |
|
"grad_norm": 2.9073598384857178, |
|
"learning_rate": 1.1846441947565544e-05, |
|
"loss": 0.3152, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 12.35, |
|
"grad_norm": 1.6023261547088623, |
|
"learning_rate": 1.1771535580524345e-05, |
|
"loss": 0.248, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 12.46, |
|
"grad_norm": 1.7954633235931396, |
|
"learning_rate": 1.1696629213483147e-05, |
|
"loss": 0.2666, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 12.57, |
|
"grad_norm": 2.0331828594207764, |
|
"learning_rate": 1.162172284644195e-05, |
|
"loss": 0.2878, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 12.68, |
|
"grad_norm": 1.656420350074768, |
|
"learning_rate": 1.1546816479400751e-05, |
|
"loss": 0.2805, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 1.5245873928070068, |
|
"learning_rate": 1.1471910112359552e-05, |
|
"loss": 0.2792, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 12.91, |
|
"grad_norm": 2.6713974475860596, |
|
"learning_rate": 1.1397003745318354e-05, |
|
"loss": 0.2841, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 13.02, |
|
"grad_norm": 1.268479347229004, |
|
"learning_rate": 1.1322097378277155e-05, |
|
"loss": 0.2708, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 13.13, |
|
"grad_norm": 2.2990434169769287, |
|
"learning_rate": 1.1247191011235956e-05, |
|
"loss": 0.2649, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 13.24, |
|
"grad_norm": 2.351956367492676, |
|
"learning_rate": 1.1172284644194758e-05, |
|
"loss": 0.281, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 13.36, |
|
"grad_norm": 1.796783208847046, |
|
"learning_rate": 1.1097378277153559e-05, |
|
"loss": 0.2725, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 13.47, |
|
"grad_norm": 1.7035847902297974, |
|
"learning_rate": 1.102247191011236e-05, |
|
"loss": 0.2799, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 13.58, |
|
"grad_norm": 2.0395431518554688, |
|
"learning_rate": 1.0947565543071162e-05, |
|
"loss": 0.239, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 13.69, |
|
"grad_norm": 1.8008232116699219, |
|
"learning_rate": 1.0872659176029963e-05, |
|
"loss": 0.2553, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 13.81, |
|
"grad_norm": 2.0559043884277344, |
|
"learning_rate": 1.0797752808988765e-05, |
|
"loss": 0.2464, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 13.92, |
|
"grad_norm": 1.8673292398452759, |
|
"learning_rate": 1.0722846441947568e-05, |
|
"loss": 0.2699, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 14.03, |
|
"grad_norm": 1.6819398403167725, |
|
"learning_rate": 1.0647940074906369e-05, |
|
"loss": 0.2566, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 14.14, |
|
"grad_norm": 1.9703686237335205, |
|
"learning_rate": 1.057303370786517e-05, |
|
"loss": 0.2807, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 2.028834819793701, |
|
"learning_rate": 1.0498127340823972e-05, |
|
"loss": 0.2392, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 14.37, |
|
"grad_norm": 2.2455177307128906, |
|
"learning_rate": 1.0423220973782773e-05, |
|
"loss": 0.247, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 14.48, |
|
"grad_norm": 1.8078291416168213, |
|
"learning_rate": 1.0348314606741574e-05, |
|
"loss": 0.2552, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 14.59, |
|
"grad_norm": 2.166729211807251, |
|
"learning_rate": 1.0273408239700376e-05, |
|
"loss": 0.2466, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 14.7, |
|
"grad_norm": 2.710556745529175, |
|
"learning_rate": 1.0198501872659177e-05, |
|
"loss": 0.2506, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 14.82, |
|
"grad_norm": 2.1344659328460693, |
|
"learning_rate": 1.0123595505617978e-05, |
|
"loss": 0.2388, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 14.93, |
|
"grad_norm": 1.595842719078064, |
|
"learning_rate": 1.004868913857678e-05, |
|
"loss": 0.2553, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 15.04, |
|
"grad_norm": 1.5458731651306152, |
|
"learning_rate": 9.973782771535581e-06, |
|
"loss": 0.2478, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 15.15, |
|
"grad_norm": 1.9514356851577759, |
|
"learning_rate": 9.898876404494382e-06, |
|
"loss": 0.234, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 15.26, |
|
"grad_norm": 2.1551694869995117, |
|
"learning_rate": 9.823970037453184e-06, |
|
"loss": 0.251, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 15.38, |
|
"grad_norm": 2.08258318901062, |
|
"learning_rate": 9.749063670411985e-06, |
|
"loss": 0.2511, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 15.49, |
|
"grad_norm": 1.581690788269043, |
|
"learning_rate": 9.674157303370786e-06, |
|
"loss": 0.2185, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"grad_norm": 2.2121975421905518, |
|
"learning_rate": 9.599250936329588e-06, |
|
"loss": 0.2161, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 15.71, |
|
"grad_norm": 1.5077215433120728, |
|
"learning_rate": 9.52434456928839e-06, |
|
"loss": 0.2308, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 15.83, |
|
"grad_norm": 2.57951021194458, |
|
"learning_rate": 9.449438202247192e-06, |
|
"loss": 0.2299, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 15.94, |
|
"grad_norm": 1.6634414196014404, |
|
"learning_rate": 9.374531835205993e-06, |
|
"loss": 0.2576, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 16.05, |
|
"grad_norm": 1.9692113399505615, |
|
"learning_rate": 9.299625468164795e-06, |
|
"loss": 0.2395, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 16.16, |
|
"grad_norm": 1.9327415227890015, |
|
"learning_rate": 9.224719101123596e-06, |
|
"loss": 0.241, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 16.27, |
|
"grad_norm": 1.7675727605819702, |
|
"learning_rate": 9.149812734082398e-06, |
|
"loss": 0.2201, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 16.39, |
|
"grad_norm": 1.9511345624923706, |
|
"learning_rate": 9.074906367041199e-06, |
|
"loss": 0.2171, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 16.5, |
|
"grad_norm": 1.7937383651733398, |
|
"learning_rate": 9e-06, |
|
"loss": 0.2286, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 16.61, |
|
"grad_norm": 1.79076087474823, |
|
"learning_rate": 8.925093632958802e-06, |
|
"loss": 0.2479, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 16.72, |
|
"grad_norm": 2.4045145511627197, |
|
"learning_rate": 8.850187265917603e-06, |
|
"loss": 0.2153, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 16.84, |
|
"grad_norm": 2.1934499740600586, |
|
"learning_rate": 8.775280898876404e-06, |
|
"loss": 0.2361, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 5340, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"total_flos": 7.7973833613312e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|