{ |
|
"best_metric": 0.9866785079928952, |
|
"best_model_checkpoint": "cvt-13-finetuned-ibird/checkpoint-1696", |
|
"epoch": 4.995579133510168, |
|
"eval_steps": 500, |
|
"global_step": 2825, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.22133731842041, |
|
"learning_rate": 1.76678445229682e-06, |
|
"loss": 3.2244, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.790149688720703, |
|
"learning_rate": 3.53356890459364e-06, |
|
"loss": 3.2169, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.872779369354248, |
|
"learning_rate": 5.30035335689046e-06, |
|
"loss": 3.2236, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.464066982269287, |
|
"learning_rate": 7.06713780918728e-06, |
|
"loss": 3.2237, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.8840532302856445, |
|
"learning_rate": 8.8339222614841e-06, |
|
"loss": 3.2091, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.897418975830078, |
|
"learning_rate": 1.060070671378092e-05, |
|
"loss": 3.2041, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.91304874420166, |
|
"learning_rate": 1.236749116607774e-05, |
|
"loss": 3.1712, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.0000202655792236, |
|
"learning_rate": 1.413427561837456e-05, |
|
"loss": 3.1854, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.865732192993164, |
|
"learning_rate": 1.5901060070671377e-05, |
|
"loss": 3.1636, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.4034979343414307, |
|
"learning_rate": 1.76678445229682e-05, |
|
"loss": 3.128, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.288928747177124, |
|
"learning_rate": 1.9434628975265016e-05, |
|
"loss": 3.1292, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.1810078620910645, |
|
"learning_rate": 2.120141342756184e-05, |
|
"loss": 3.0734, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.9931440353393555, |
|
"learning_rate": 2.296819787985866e-05, |
|
"loss": 3.0704, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.1638436317443848, |
|
"learning_rate": 2.473498233215548e-05, |
|
"loss": 3.0107, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.532331705093384, |
|
"learning_rate": 2.6501766784452298e-05, |
|
"loss": 2.9914, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.235107421875, |
|
"learning_rate": 2.826855123674912e-05, |
|
"loss": 2.9416, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.254007339477539, |
|
"learning_rate": 3.003533568904594e-05, |
|
"loss": 2.9103, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.413905143737793, |
|
"learning_rate": 3.1802120141342755e-05, |
|
"loss": 2.8261, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.3824758529663086, |
|
"learning_rate": 3.356890459363958e-05, |
|
"loss": 2.7711, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.0212182998657227, |
|
"learning_rate": 3.53356890459364e-05, |
|
"loss": 2.7134, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.3175628185272217, |
|
"learning_rate": 3.710247349823322e-05, |
|
"loss": 2.5992, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.2799651622772217, |
|
"learning_rate": 3.886925795053003e-05, |
|
"loss": 2.565, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 5.323554039001465, |
|
"learning_rate": 4.063604240282686e-05, |
|
"loss": 2.4975, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.1259026527404785, |
|
"learning_rate": 4.240282685512368e-05, |
|
"loss": 2.3784, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.178640842437744, |
|
"learning_rate": 4.416961130742049e-05, |
|
"loss": 2.3291, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 3.33782696723938, |
|
"learning_rate": 4.593639575971732e-05, |
|
"loss": 2.2431, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.880281686782837, |
|
"learning_rate": 4.7703180212014135e-05, |
|
"loss": 2.135, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.968682050704956, |
|
"learning_rate": 4.946996466431096e-05, |
|
"loss": 2.0128, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.611078262329102, |
|
"learning_rate": 4.9862313139260423e-05, |
|
"loss": 2.0385, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.0588765144348145, |
|
"learning_rate": 4.9665617623918175e-05, |
|
"loss": 1.8919, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.524935722351074, |
|
"learning_rate": 4.9468922108575926e-05, |
|
"loss": 1.8712, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.9682562351226807, |
|
"learning_rate": 4.927222659323368e-05, |
|
"loss": 1.7009, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.5031826496124268, |
|
"learning_rate": 4.907553107789143e-05, |
|
"loss": 1.5617, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.6093549728393555, |
|
"learning_rate": 4.887883556254917e-05, |
|
"loss": 1.6847, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 4.248983383178711, |
|
"learning_rate": 4.8682140047206924e-05, |
|
"loss": 1.5254, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.7051937580108643, |
|
"learning_rate": 4.8485444531864675e-05, |
|
"loss": 1.4551, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.0025603771209717, |
|
"learning_rate": 4.8288749016522426e-05, |
|
"loss": 1.5533, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.4976930618286133, |
|
"learning_rate": 4.809205350118017e-05, |
|
"loss": 1.3265, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.950221061706543, |
|
"learning_rate": 4.789535798583792e-05, |
|
"loss": 1.2725, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 4.968952178955078, |
|
"learning_rate": 4.769866247049567e-05, |
|
"loss": 1.2879, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.3156518936157227, |
|
"learning_rate": 4.7501966955153424e-05, |
|
"loss": 1.1826, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.190901756286621, |
|
"learning_rate": 4.7305271439811175e-05, |
|
"loss": 1.1947, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.0758602619171143, |
|
"learning_rate": 4.7108575924468926e-05, |
|
"loss": 1.2037, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.5758004188537598, |
|
"learning_rate": 4.691188040912668e-05, |
|
"loss": 1.1067, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.4717776775360107, |
|
"learning_rate": 4.671518489378442e-05, |
|
"loss": 1.2233, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.4771013259887695, |
|
"learning_rate": 4.651848937844217e-05, |
|
"loss": 1.0819, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.409405469894409, |
|
"learning_rate": 4.6321793863099924e-05, |
|
"loss": 1.2001, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.6338610649108887, |
|
"learning_rate": 4.6125098347757675e-05, |
|
"loss": 1.0693, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.0745251178741455, |
|
"learning_rate": 4.5928402832415426e-05, |
|
"loss": 1.1109, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.047109842300415, |
|
"learning_rate": 4.573170731707318e-05, |
|
"loss": 1.0295, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.1728410720825195, |
|
"learning_rate": 4.553501180173092e-05, |
|
"loss": 1.0533, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.705878973007202, |
|
"learning_rate": 4.533831628638867e-05, |
|
"loss": 1.0001, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.719705820083618, |
|
"learning_rate": 4.5141620771046424e-05, |
|
"loss": 1.0657, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.654970169067383, |
|
"learning_rate": 4.4944925255704175e-05, |
|
"loss": 0.9326, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 5.275374889373779, |
|
"learning_rate": 4.4748229740361926e-05, |
|
"loss": 1.0006, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 3.071272373199463, |
|
"learning_rate": 4.455153422501967e-05, |
|
"loss": 1.0089, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.9649200710479574, |
|
"eval_loss": 0.2877684533596039, |
|
"eval_runtime": 86.5125, |
|
"eval_samples_per_second": 26.031, |
|
"eval_steps_per_second": 3.26, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 3.3852458000183105, |
|
"learning_rate": 4.435483870967742e-05, |
|
"loss": 0.865, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.0667455196380615, |
|
"learning_rate": 4.415814319433517e-05, |
|
"loss": 0.9117, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 1.9478442668914795, |
|
"learning_rate": 4.3961447678992924e-05, |
|
"loss": 0.8391, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.9961579442024231, |
|
"learning_rate": 4.376475216365067e-05, |
|
"loss": 0.8528, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.6760354042053223, |
|
"learning_rate": 4.356805664830842e-05, |
|
"loss": 0.8387, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 3.119180917739868, |
|
"learning_rate": 4.337136113296617e-05, |
|
"loss": 0.7549, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 3.2463724613189697, |
|
"learning_rate": 4.317466561762392e-05, |
|
"loss": 0.9329, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 1.7437245845794678, |
|
"learning_rate": 4.297797010228167e-05, |
|
"loss": 0.824, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.5226683616638184, |
|
"learning_rate": 4.278127458693942e-05, |
|
"loss": 0.8805, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.30314302444458, |
|
"learning_rate": 4.258457907159717e-05, |
|
"loss": 0.9097, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.0539016723632812, |
|
"learning_rate": 4.238788355625492e-05, |
|
"loss": 0.8936, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 3.0018861293792725, |
|
"learning_rate": 4.219118804091267e-05, |
|
"loss": 0.9214, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.0286073684692383, |
|
"learning_rate": 4.1994492525570416e-05, |
|
"loss": 0.7991, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.8428072929382324, |
|
"learning_rate": 4.179779701022817e-05, |
|
"loss": 0.756, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 1.7304598093032837, |
|
"learning_rate": 4.160110149488592e-05, |
|
"loss": 0.7738, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 3.970334768295288, |
|
"learning_rate": 4.140440597954367e-05, |
|
"loss": 0.7426, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.3020598888397217, |
|
"learning_rate": 4.1207710464201413e-05, |
|
"loss": 0.7884, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 1.7566787004470825, |
|
"learning_rate": 4.1011014948859165e-05, |
|
"loss": 0.7569, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 3.269785165786743, |
|
"learning_rate": 4.0814319433516916e-05, |
|
"loss": 0.8191, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.191699504852295, |
|
"learning_rate": 4.061762391817467e-05, |
|
"loss": 0.8077, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 1.7478408813476562, |
|
"learning_rate": 4.042092840283242e-05, |
|
"loss": 0.7621, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.7008702754974365, |
|
"learning_rate": 4.022423288749016e-05, |
|
"loss": 0.81, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.9396083354949951, |
|
"learning_rate": 4.0027537372147914e-05, |
|
"loss": 0.7493, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.55366587638855, |
|
"learning_rate": 3.9830841856805665e-05, |
|
"loss": 0.7471, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 1.6567083597183228, |
|
"learning_rate": 3.9634146341463416e-05, |
|
"loss": 0.9089, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 4.915744781494141, |
|
"learning_rate": 3.943745082612117e-05, |
|
"loss": 0.8211, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 1.2864570617675781, |
|
"learning_rate": 3.924075531077892e-05, |
|
"loss": 0.7662, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.4654157161712646, |
|
"learning_rate": 3.904405979543666e-05, |
|
"loss": 0.7942, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 5.769187927246094, |
|
"learning_rate": 3.8847364280094414e-05, |
|
"loss": 0.73, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.6875308156013489, |
|
"learning_rate": 3.8650668764752165e-05, |
|
"loss": 0.7394, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 1.2160513401031494, |
|
"learning_rate": 3.8453973249409916e-05, |
|
"loss": 0.846, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.4410216808319092, |
|
"learning_rate": 3.825727773406767e-05, |
|
"loss": 0.7366, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 4.212517261505127, |
|
"learning_rate": 3.806058221872542e-05, |
|
"loss": 0.8275, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.9933316111564636, |
|
"learning_rate": 3.786388670338317e-05, |
|
"loss": 0.7688, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.2545719146728516, |
|
"learning_rate": 3.7667191188040914e-05, |
|
"loss": 0.734, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 1.0783274173736572, |
|
"learning_rate": 3.7470495672698665e-05, |
|
"loss": 0.8099, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 1.6741780042648315, |
|
"learning_rate": 3.7273800157356416e-05, |
|
"loss": 0.9695, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 3.448066473007202, |
|
"learning_rate": 3.707710464201417e-05, |
|
"loss": 0.7506, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 9.674800872802734, |
|
"learning_rate": 3.688040912667191e-05, |
|
"loss": 0.6566, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 6.721800327301025, |
|
"learning_rate": 3.668371361132966e-05, |
|
"loss": 0.7075, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 5.588386058807373, |
|
"learning_rate": 3.6487018095987414e-05, |
|
"loss": 0.8246, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 1.4187864065170288, |
|
"learning_rate": 3.6290322580645165e-05, |
|
"loss": 0.7808, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.7722711563110352, |
|
"learning_rate": 3.6093627065302916e-05, |
|
"loss": 0.8957, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 1.5007630586624146, |
|
"learning_rate": 3.589693154996066e-05, |
|
"loss": 0.7536, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.7198464870452881, |
|
"learning_rate": 3.570023603461841e-05, |
|
"loss": 0.6727, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 6.015268802642822, |
|
"learning_rate": 3.550354051927616e-05, |
|
"loss": 0.737, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.5802467465400696, |
|
"learning_rate": 3.5306845003933914e-05, |
|
"loss": 0.6929, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 6.546850681304932, |
|
"learning_rate": 3.511014948859166e-05, |
|
"loss": 0.7529, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 7.255277156829834, |
|
"learning_rate": 3.491345397324941e-05, |
|
"loss": 0.8639, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.564946711063385, |
|
"learning_rate": 3.471675845790716e-05, |
|
"loss": 0.7234, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 1.2637860774993896, |
|
"learning_rate": 3.452006294256491e-05, |
|
"loss": 0.9388, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 1.095790147781372, |
|
"learning_rate": 3.432336742722266e-05, |
|
"loss": 0.7637, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 4.213192462921143, |
|
"learning_rate": 3.412667191188041e-05, |
|
"loss": 0.7857, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 9.215189933776855, |
|
"learning_rate": 3.392997639653816e-05, |
|
"loss": 0.6239, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.098184585571289, |
|
"learning_rate": 3.373328088119591e-05, |
|
"loss": 0.5597, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.9442644119262695, |
|
"learning_rate": 3.353658536585366e-05, |
|
"loss": 0.7885, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 6.179882526397705, |
|
"learning_rate": 3.3339889850511406e-05, |
|
"loss": 0.7855, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9791296625222025, |
|
"eval_loss": 0.10786169767379761, |
|
"eval_runtime": 86.2088, |
|
"eval_samples_per_second": 26.123, |
|
"eval_steps_per_second": 3.271, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.36537787318229675, |
|
"learning_rate": 3.314319433516916e-05, |
|
"loss": 0.8073, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.9752954840660095, |
|
"learning_rate": 3.294649881982691e-05, |
|
"loss": 0.586, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.6317528486251831, |
|
"learning_rate": 3.274980330448466e-05, |
|
"loss": 0.7136, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.4661918580532074, |
|
"learning_rate": 3.255310778914241e-05, |
|
"loss": 0.7598, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 0.6691815853118896, |
|
"learning_rate": 3.2356412273800155e-05, |
|
"loss": 0.9078, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.7598626613616943, |
|
"learning_rate": 3.2159716758457906e-05, |
|
"loss": 0.5849, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.4353080093860626, |
|
"learning_rate": 3.196302124311566e-05, |
|
"loss": 0.8519, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.5575789213180542, |
|
"learning_rate": 3.176632572777341e-05, |
|
"loss": 0.7124, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.028132677078247, |
|
"learning_rate": 3.156963021243116e-05, |
|
"loss": 0.8845, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 2.471712350845337, |
|
"learning_rate": 3.137293469708891e-05, |
|
"loss": 0.7683, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 3.2386505603790283, |
|
"learning_rate": 3.1176239181746655e-05, |
|
"loss": 0.6573, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.6279830932617188, |
|
"learning_rate": 3.0979543666404406e-05, |
|
"loss": 0.6261, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.3760743141174316, |
|
"learning_rate": 3.078284815106216e-05, |
|
"loss": 0.8395, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.6067408919334412, |
|
"learning_rate": 3.058615263571991e-05, |
|
"loss": 0.6631, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.8222625255584717, |
|
"learning_rate": 3.0389457120377656e-05, |
|
"loss": 0.7618, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.4705863296985626, |
|
"learning_rate": 3.0192761605035407e-05, |
|
"loss": 0.657, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.885718047618866, |
|
"learning_rate": 2.999606608969316e-05, |
|
"loss": 0.6618, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.5552624464035034, |
|
"learning_rate": 2.9799370574350903e-05, |
|
"loss": 0.7135, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.8557870388031006, |
|
"learning_rate": 2.9602675059008654e-05, |
|
"loss": 0.6492, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 1.226236343383789, |
|
"learning_rate": 2.9405979543666405e-05, |
|
"loss": 0.5645, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.4508123993873596, |
|
"learning_rate": 2.9209284028324156e-05, |
|
"loss": 0.653, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 5.647830963134766, |
|
"learning_rate": 2.9012588512981904e-05, |
|
"loss": 0.6795, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.6091474294662476, |
|
"learning_rate": 2.8815892997639655e-05, |
|
"loss": 0.7905, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.5606883764266968, |
|
"learning_rate": 2.8619197482297406e-05, |
|
"loss": 0.6878, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 2.7051727771759033, |
|
"learning_rate": 2.8422501966955157e-05, |
|
"loss": 0.6996, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.7471837401390076, |
|
"learning_rate": 2.822580645161291e-05, |
|
"loss": 0.706, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.322110176086426, |
|
"learning_rate": 2.8029110936270653e-05, |
|
"loss": 0.9161, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.7188151478767395, |
|
"learning_rate": 2.7832415420928404e-05, |
|
"loss": 0.6905, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 10.193795204162598, |
|
"learning_rate": 2.7635719905586155e-05, |
|
"loss": 0.7407, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.9392734169960022, |
|
"learning_rate": 2.7439024390243906e-05, |
|
"loss": 0.6731, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.5180323719978333, |
|
"learning_rate": 2.724232887490165e-05, |
|
"loss": 0.5429, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.4639306664466858, |
|
"learning_rate": 2.7045633359559402e-05, |
|
"loss": 0.7713, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 3.5896573066711426, |
|
"learning_rate": 2.6848937844217153e-05, |
|
"loss": 0.7773, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.7149112820625305, |
|
"learning_rate": 2.6652242328874904e-05, |
|
"loss": 0.6672, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.8536508679389954, |
|
"learning_rate": 2.645554681353265e-05, |
|
"loss": 0.7086, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 1.054091215133667, |
|
"learning_rate": 2.62588512981904e-05, |
|
"loss": 0.6519, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.39519792795181274, |
|
"learning_rate": 2.606215578284815e-05, |
|
"loss": 0.6434, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.4575929343700409, |
|
"learning_rate": 2.5865460267505902e-05, |
|
"loss": 0.6553, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.534916877746582, |
|
"learning_rate": 2.5668764752163653e-05, |
|
"loss": 0.7915, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.7122427821159363, |
|
"learning_rate": 2.54720692368214e-05, |
|
"loss": 0.6913, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.49569156765937805, |
|
"learning_rate": 2.5275373721479152e-05, |
|
"loss": 0.6085, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.466847687959671, |
|
"learning_rate": 2.5078678206136903e-05, |
|
"loss": 0.6222, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.34097394347190857, |
|
"learning_rate": 2.488198269079465e-05, |
|
"loss": 0.6883, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.601562261581421, |
|
"learning_rate": 2.4685287175452402e-05, |
|
"loss": 0.6904, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.5125943422317505, |
|
"learning_rate": 2.448859166011015e-05, |
|
"loss": 0.6484, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.4327985644340515, |
|
"learning_rate": 2.42918961447679e-05, |
|
"loss": 0.7579, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.4168800115585327, |
|
"learning_rate": 2.4095200629425652e-05, |
|
"loss": 0.7012, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 2.620213270187378, |
|
"learning_rate": 2.38985051140834e-05, |
|
"loss": 0.6107, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.47252315282821655, |
|
"learning_rate": 2.370180959874115e-05, |
|
"loss": 0.63, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 3.600597620010376, |
|
"learning_rate": 2.35051140833989e-05, |
|
"loss": 0.7014, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.9168177843093872, |
|
"learning_rate": 2.330841856805665e-05, |
|
"loss": 0.6946, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.111616611480713, |
|
"learning_rate": 2.3111723052714398e-05, |
|
"loss": 0.5557, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.4891892671585083, |
|
"learning_rate": 2.291502753737215e-05, |
|
"loss": 0.6718, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.4157081842422485, |
|
"learning_rate": 2.2718332022029897e-05, |
|
"loss": 0.7549, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.3661864399909973, |
|
"learning_rate": 2.2521636506687648e-05, |
|
"loss": 0.8072, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 2.6403448581695557, |
|
"learning_rate": 2.2324940991345396e-05, |
|
"loss": 0.6657, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9866785079928952, |
|
"eval_loss": 0.07058515399694443, |
|
"eval_runtime": 87.367, |
|
"eval_samples_per_second": 25.776, |
|
"eval_steps_per_second": 3.228, |
|
"step": 1696 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.5366094708442688, |
|
"learning_rate": 2.2128245476003147e-05, |
|
"loss": 0.7239, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.5593940019607544, |
|
"learning_rate": 2.1931549960660898e-05, |
|
"loss": 0.8491, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.2935149371623993, |
|
"learning_rate": 2.1734854445318646e-05, |
|
"loss": 0.6358, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.3379581570625305, |
|
"learning_rate": 2.1538158929976397e-05, |
|
"loss": 0.6124, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.6903249621391296, |
|
"learning_rate": 2.134146341463415e-05, |
|
"loss": 0.7021, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.5648875832557678, |
|
"learning_rate": 2.11447678992919e-05, |
|
"loss": 0.7315, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 3.177462100982666, |
|
"learning_rate": 2.0948072383949647e-05, |
|
"loss": 0.7836, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 2.3751437664031982, |
|
"learning_rate": 2.07513768686074e-05, |
|
"loss": 0.5984, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.39180535078048706, |
|
"learning_rate": 2.0554681353265146e-05, |
|
"loss": 0.6462, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.7616192102432251, |
|
"learning_rate": 2.0357985837922897e-05, |
|
"loss": 0.648, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.7866030931472778, |
|
"learning_rate": 2.0161290322580645e-05, |
|
"loss": 0.6992, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 2.5894863605499268, |
|
"learning_rate": 1.9964594807238396e-05, |
|
"loss": 0.7294, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.33011573553085327, |
|
"learning_rate": 1.9767899291896147e-05, |
|
"loss": 0.6715, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.21015465259552, |
|
"learning_rate": 1.9571203776553895e-05, |
|
"loss": 0.6617, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.3521706461906433, |
|
"learning_rate": 1.9374508261211646e-05, |
|
"loss": 0.7361, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 1.986220359802246, |
|
"learning_rate": 1.9177812745869394e-05, |
|
"loss": 0.782, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.6578150391578674, |
|
"learning_rate": 1.8981117230527145e-05, |
|
"loss": 0.6532, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.5501819252967834, |
|
"learning_rate": 1.8784421715184893e-05, |
|
"loss": 0.5756, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 1.0883512496948242, |
|
"learning_rate": 1.8587726199842644e-05, |
|
"loss": 0.6532, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.49157479405403137, |
|
"learning_rate": 1.8391030684500392e-05, |
|
"loss": 0.6951, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.6763016581535339, |
|
"learning_rate": 1.8194335169158143e-05, |
|
"loss": 0.5232, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 1.0173105001449585, |
|
"learning_rate": 1.799763965381589e-05, |
|
"loss": 0.6729, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.49198317527770996, |
|
"learning_rate": 1.7800944138473642e-05, |
|
"loss": 0.6914, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.7755669355392456, |
|
"learning_rate": 1.7604248623131393e-05, |
|
"loss": 0.6728, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.4817189574241638, |
|
"learning_rate": 1.7407553107789144e-05, |
|
"loss": 0.7147, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 1.0511811971664429, |
|
"learning_rate": 1.7210857592446896e-05, |
|
"loss": 0.7447, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 0.4621807038784027, |
|
"learning_rate": 1.7014162077104643e-05, |
|
"loss": 0.7905, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.42619049549102783, |
|
"learning_rate": 1.6817466561762395e-05, |
|
"loss": 0.6953, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.4883289933204651, |
|
"learning_rate": 1.6620771046420142e-05, |
|
"loss": 0.7246, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 5.140800476074219, |
|
"learning_rate": 1.6424075531077893e-05, |
|
"loss": 0.677, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.7420614361763, |
|
"learning_rate": 1.622738001573564e-05, |
|
"loss": 0.6223, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.5618042945861816, |
|
"learning_rate": 1.6030684500393392e-05, |
|
"loss": 0.7706, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 8.696101188659668, |
|
"learning_rate": 1.583398898505114e-05, |
|
"loss": 0.5982, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.613085925579071, |
|
"learning_rate": 1.563729346970889e-05, |
|
"loss": 0.5607, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.37136101722717285, |
|
"learning_rate": 1.5440597954366642e-05, |
|
"loss": 0.7066, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 1.0370784997940063, |
|
"learning_rate": 1.524390243902439e-05, |
|
"loss": 0.6117, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 3.805783748626709, |
|
"learning_rate": 1.5047206923682141e-05, |
|
"loss": 0.7247, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.3479630947113037, |
|
"learning_rate": 1.485051140833989e-05, |
|
"loss": 0.7094, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.27747422456741333, |
|
"learning_rate": 1.465381589299764e-05, |
|
"loss": 0.6548, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.4813224971294403, |
|
"learning_rate": 1.445712037765539e-05, |
|
"loss": 0.5747, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 1.6526601314544678, |
|
"learning_rate": 1.4260424862313141e-05, |
|
"loss": 0.8374, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.6736083626747131, |
|
"learning_rate": 1.4063729346970889e-05, |
|
"loss": 0.7677, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 1.8762106895446777, |
|
"learning_rate": 1.386703383162864e-05, |
|
"loss": 0.5897, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.622199296951294, |
|
"learning_rate": 1.3670338316286388e-05, |
|
"loss": 0.7407, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.3692738115787506, |
|
"learning_rate": 1.3473642800944139e-05, |
|
"loss": 0.8474, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.5641698241233826, |
|
"learning_rate": 1.327694728560189e-05, |
|
"loss": 0.6674, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.7227357029914856, |
|
"learning_rate": 1.3080251770259638e-05, |
|
"loss": 0.6852, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 0.80736243724823, |
|
"learning_rate": 1.2883556254917389e-05, |
|
"loss": 0.7686, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.7187584042549133, |
|
"learning_rate": 1.2686860739575138e-05, |
|
"loss": 0.7859, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.6288447976112366, |
|
"learning_rate": 1.249016522423289e-05, |
|
"loss": 0.683, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.6657484769821167, |
|
"learning_rate": 1.2293469708890639e-05, |
|
"loss": 0.6807, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.3658302128314972, |
|
"learning_rate": 1.2096774193548388e-05, |
|
"loss": 0.6274, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.7069531083106995, |
|
"learning_rate": 1.1900078678206138e-05, |
|
"loss": 0.7722, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.2362523227930069, |
|
"learning_rate": 1.1703383162863887e-05, |
|
"loss": 0.7107, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.49751946330070496, |
|
"learning_rate": 1.1506687647521637e-05, |
|
"loss": 0.6972, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.4281612038612366, |
|
"learning_rate": 1.1309992132179386e-05, |
|
"loss": 0.6935, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.47943803668022156, |
|
"learning_rate": 1.1113296616837136e-05, |
|
"loss": 0.7475, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9866785079928952, |
|
"eval_loss": 0.06677553802728653, |
|
"eval_runtime": 87.1331, |
|
"eval_samples_per_second": 25.846, |
|
"eval_steps_per_second": 3.236, |
|
"step": 2262 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 1.7420241832733154, |
|
"learning_rate": 1.0916601101494885e-05, |
|
"loss": 0.7495, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.43095558881759644, |
|
"learning_rate": 1.0719905586152636e-05, |
|
"loss": 0.6875, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 4.584779739379883, |
|
"learning_rate": 1.0523210070810386e-05, |
|
"loss": 0.8217, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.3836701810359955, |
|
"learning_rate": 1.0326514555468137e-05, |
|
"loss": 0.5724, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.8410959243774414, |
|
"learning_rate": 1.0129819040125886e-05, |
|
"loss": 0.6068, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.2401944100856781, |
|
"learning_rate": 9.933123524783636e-06, |
|
"loss": 0.6172, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 1.9553325176239014, |
|
"learning_rate": 9.736428009441385e-06, |
|
"loss": 0.5384, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.7399584650993347, |
|
"learning_rate": 9.539732494099135e-06, |
|
"loss": 0.6562, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.6674773097038269, |
|
"learning_rate": 9.343036978756884e-06, |
|
"loss": 0.736, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.35315585136413574, |
|
"learning_rate": 9.146341463414634e-06, |
|
"loss": 0.6334, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.8621203303337097, |
|
"learning_rate": 8.949645948072383e-06, |
|
"loss": 0.7214, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 6.545629501342773, |
|
"learning_rate": 8.752950432730134e-06, |
|
"loss": 0.5391, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 0.5393751859664917, |
|
"learning_rate": 8.556254917387884e-06, |
|
"loss": 0.6851, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.8293077945709229, |
|
"learning_rate": 8.359559402045635e-06, |
|
"loss": 0.791, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.44837647676467896, |
|
"learning_rate": 8.162863886703385e-06, |
|
"loss": 0.5622, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.5060232281684875, |
|
"learning_rate": 7.966168371361134e-06, |
|
"loss": 0.7254, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.647249698638916, |
|
"learning_rate": 7.769472856018883e-06, |
|
"loss": 0.5765, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.7065703868865967, |
|
"learning_rate": 7.572777340676633e-06, |
|
"loss": 0.7337, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.6291216015815735, |
|
"learning_rate": 7.376081825334382e-06, |
|
"loss": 0.649, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.6098889708518982, |
|
"learning_rate": 7.179386309992133e-06, |
|
"loss": 0.6712, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.49595367908477783, |
|
"learning_rate": 6.982690794649882e-06, |
|
"loss": 0.5932, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 0.34834733605384827, |
|
"learning_rate": 6.785995279307632e-06, |
|
"loss": 0.5702, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.30979278683662415, |
|
"learning_rate": 6.589299763965381e-06, |
|
"loss": 0.6423, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 1.0691258907318115, |
|
"learning_rate": 6.392604248623131e-06, |
|
"loss": 0.6254, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.6674053072929382, |
|
"learning_rate": 6.195908733280882e-06, |
|
"loss": 0.6869, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.4127647280693054, |
|
"learning_rate": 5.999213217938631e-06, |
|
"loss": 0.6614, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 1.5234615802764893, |
|
"learning_rate": 5.802517702596381e-06, |
|
"loss": 0.716, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.3525974750518799, |
|
"learning_rate": 5.605822187254131e-06, |
|
"loss": 0.8085, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.9571001529693604, |
|
"learning_rate": 5.40912667191188e-06, |
|
"loss": 0.6391, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.5705153942108154, |
|
"learning_rate": 5.212431156569631e-06, |
|
"loss": 0.5403, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.6829048991203308, |
|
"learning_rate": 5.01573564122738e-06, |
|
"loss": 0.6538, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 1.4802197217941284, |
|
"learning_rate": 4.81904012588513e-06, |
|
"loss": 0.7467, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 1.3378098011016846, |
|
"learning_rate": 4.62234461054288e-06, |
|
"loss": 0.7882, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.3483523428440094, |
|
"learning_rate": 4.425649095200629e-06, |
|
"loss": 0.6253, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.7070222496986389, |
|
"learning_rate": 4.22895357985838e-06, |
|
"loss": 0.8082, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.2973116934299469, |
|
"learning_rate": 4.032258064516129e-06, |
|
"loss": 0.7245, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.6432918310165405, |
|
"learning_rate": 3.835562549173879e-06, |
|
"loss": 0.5764, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.8339099287986755, |
|
"learning_rate": 3.638867033831629e-06, |
|
"loss": 0.607, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.5163792371749878, |
|
"learning_rate": 3.442171518489379e-06, |
|
"loss": 0.6602, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 1.1758856773376465, |
|
"learning_rate": 3.2454760031471283e-06, |
|
"loss": 0.8349, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.5800232887268066, |
|
"learning_rate": 3.0487804878048782e-06, |
|
"loss": 0.6944, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.342181921005249, |
|
"learning_rate": 2.852084972462628e-06, |
|
"loss": 0.6855, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.5758516788482666, |
|
"learning_rate": 2.655389457120378e-06, |
|
"loss": 0.5884, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.42382463812828064, |
|
"learning_rate": 2.4586939417781275e-06, |
|
"loss": 0.7499, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.5452685952186584, |
|
"learning_rate": 2.2619984264358773e-06, |
|
"loss": 0.6471, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 1.8580130338668823, |
|
"learning_rate": 2.0653029110936272e-06, |
|
"loss": 0.6492, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 1.0084635019302368, |
|
"learning_rate": 1.868607395751377e-06, |
|
"loss": 0.5958, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.44602665305137634, |
|
"learning_rate": 1.6719118804091268e-06, |
|
"loss": 0.6962, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 0.45221227407455444, |
|
"learning_rate": 1.4752163650668765e-06, |
|
"loss": 0.6997, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.639831304550171, |
|
"learning_rate": 1.2785208497246264e-06, |
|
"loss": 0.6848, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.6076949238777161, |
|
"learning_rate": 1.0818253343823763e-06, |
|
"loss": 0.6033, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.46627792716026306, |
|
"learning_rate": 8.85129819040126e-07, |
|
"loss": 0.6447, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 1.5518214702606201, |
|
"learning_rate": 6.884343036978757e-07, |
|
"loss": 0.7177, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.4077186584472656, |
|
"learning_rate": 4.917387883556255e-07, |
|
"loss": 0.6174, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.6033698916435242, |
|
"learning_rate": 2.9504327301337533e-07, |
|
"loss": 0.5902, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.20630072057247162, |
|
"learning_rate": 9.834775767112511e-08, |
|
"loss": 0.6021, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9853463587921847, |
|
"eval_loss": 0.0637223869562149, |
|
"eval_runtime": 86.6358, |
|
"eval_samples_per_second": 25.994, |
|
"eval_steps_per_second": 3.255, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 2825, |
|
"total_flos": 1.6016486531331686e+18, |
|
"train_loss": 0.9856665741236864, |
|
"train_runtime": 4486.694, |
|
"train_samples_per_second": 20.16, |
|
"train_steps_per_second": 0.63 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2825, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 1.6016486531331686e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |