{ "best_metric": 0.9866785079928952, "best_model_checkpoint": "cvt-13-finetuned-ibird/checkpoint-1696", "epoch": 4.995579133510168, "eval_steps": 500, "global_step": 2825, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 3.22133731842041, "learning_rate": 1.76678445229682e-06, "loss": 3.2244, "step": 10 }, { "epoch": 0.04, "grad_norm": 2.790149688720703, "learning_rate": 3.53356890459364e-06, "loss": 3.2169, "step": 20 }, { "epoch": 0.05, "grad_norm": 2.872779369354248, "learning_rate": 5.30035335689046e-06, "loss": 3.2236, "step": 30 }, { "epoch": 0.07, "grad_norm": 2.464066982269287, "learning_rate": 7.06713780918728e-06, "loss": 3.2237, "step": 40 }, { "epoch": 0.09, "grad_norm": 3.8840532302856445, "learning_rate": 8.8339222614841e-06, "loss": 3.2091, "step": 50 }, { "epoch": 0.11, "grad_norm": 2.897418975830078, "learning_rate": 1.060070671378092e-05, "loss": 3.2041, "step": 60 }, { "epoch": 0.12, "grad_norm": 2.91304874420166, "learning_rate": 1.236749116607774e-05, "loss": 3.1712, "step": 70 }, { "epoch": 0.14, "grad_norm": 3.0000202655792236, "learning_rate": 1.413427561837456e-05, "loss": 3.1854, "step": 80 }, { "epoch": 0.16, "grad_norm": 2.865732192993164, "learning_rate": 1.5901060070671377e-05, "loss": 3.1636, "step": 90 }, { "epoch": 0.18, "grad_norm": 3.4034979343414307, "learning_rate": 1.76678445229682e-05, "loss": 3.128, "step": 100 }, { "epoch": 0.19, "grad_norm": 3.288928747177124, "learning_rate": 1.9434628975265016e-05, "loss": 3.1292, "step": 110 }, { "epoch": 0.21, "grad_norm": 5.1810078620910645, "learning_rate": 2.120141342756184e-05, "loss": 3.0734, "step": 120 }, { "epoch": 0.23, "grad_norm": 2.9931440353393555, "learning_rate": 2.296819787985866e-05, "loss": 3.0704, "step": 130 }, { "epoch": 0.25, "grad_norm": 3.1638436317443848, "learning_rate": 2.473498233215548e-05, "loss": 3.0107, "step": 140 }, { "epoch": 0.27, "grad_norm": 2.532331705093384, "learning_rate": 2.6501766784452298e-05, "loss": 2.9914, "step": 150 }, { "epoch": 0.28, "grad_norm": 3.235107421875, "learning_rate": 2.826855123674912e-05, "loss": 2.9416, "step": 160 }, { "epoch": 0.3, "grad_norm": 3.254007339477539, "learning_rate": 3.003533568904594e-05, "loss": 2.9103, "step": 170 }, { "epoch": 0.32, "grad_norm": 3.413905143737793, "learning_rate": 3.1802120141342755e-05, "loss": 2.8261, "step": 180 }, { "epoch": 0.34, "grad_norm": 3.3824758529663086, "learning_rate": 3.356890459363958e-05, "loss": 2.7711, "step": 190 }, { "epoch": 0.35, "grad_norm": 3.0212182998657227, "learning_rate": 3.53356890459364e-05, "loss": 2.7134, "step": 200 }, { "epoch": 0.37, "grad_norm": 3.3175628185272217, "learning_rate": 3.710247349823322e-05, "loss": 2.5992, "step": 210 }, { "epoch": 0.39, "grad_norm": 3.2799651622772217, "learning_rate": 3.886925795053003e-05, "loss": 2.565, "step": 220 }, { "epoch": 0.41, "grad_norm": 5.323554039001465, "learning_rate": 4.063604240282686e-05, "loss": 2.4975, "step": 230 }, { "epoch": 0.42, "grad_norm": 3.1259026527404785, "learning_rate": 4.240282685512368e-05, "loss": 2.3784, "step": 240 }, { "epoch": 0.44, "grad_norm": 3.178640842437744, "learning_rate": 4.416961130742049e-05, "loss": 2.3291, "step": 250 }, { "epoch": 0.46, "grad_norm": 3.33782696723938, "learning_rate": 4.593639575971732e-05, "loss": 2.2431, "step": 260 }, { "epoch": 0.48, "grad_norm": 3.880281686782837, "learning_rate": 4.7703180212014135e-05, "loss": 2.135, "step": 270 }, { "epoch": 0.5, "grad_norm": 3.968682050704956, "learning_rate": 4.946996466431096e-05, "loss": 2.0128, "step": 280 }, { "epoch": 0.51, "grad_norm": 4.611078262329102, "learning_rate": 4.9862313139260423e-05, "loss": 2.0385, "step": 290 }, { "epoch": 0.53, "grad_norm": 3.0588765144348145, "learning_rate": 4.9665617623918175e-05, "loss": 1.8919, "step": 300 }, { "epoch": 0.55, "grad_norm": 3.524935722351074, "learning_rate": 4.9468922108575926e-05, "loss": 1.8712, "step": 310 }, { "epoch": 0.57, "grad_norm": 3.9682562351226807, "learning_rate": 4.927222659323368e-05, "loss": 1.7009, "step": 320 }, { "epoch": 0.58, "grad_norm": 3.5031826496124268, "learning_rate": 4.907553107789143e-05, "loss": 1.5617, "step": 330 }, { "epoch": 0.6, "grad_norm": 3.6093549728393555, "learning_rate": 4.887883556254917e-05, "loss": 1.6847, "step": 340 }, { "epoch": 0.62, "grad_norm": 4.248983383178711, "learning_rate": 4.8682140047206924e-05, "loss": 1.5254, "step": 350 }, { "epoch": 0.64, "grad_norm": 3.7051937580108643, "learning_rate": 4.8485444531864675e-05, "loss": 1.4551, "step": 360 }, { "epoch": 0.65, "grad_norm": 3.0025603771209717, "learning_rate": 4.8288749016522426e-05, "loss": 1.5533, "step": 370 }, { "epoch": 0.67, "grad_norm": 3.4976930618286133, "learning_rate": 4.809205350118017e-05, "loss": 1.3265, "step": 380 }, { "epoch": 0.69, "grad_norm": 3.950221061706543, "learning_rate": 4.789535798583792e-05, "loss": 1.2725, "step": 390 }, { "epoch": 0.71, "grad_norm": 4.968952178955078, "learning_rate": 4.769866247049567e-05, "loss": 1.2879, "step": 400 }, { "epoch": 0.73, "grad_norm": 3.3156518936157227, "learning_rate": 4.7501966955153424e-05, "loss": 1.1826, "step": 410 }, { "epoch": 0.74, "grad_norm": 3.190901756286621, "learning_rate": 4.7305271439811175e-05, "loss": 1.1947, "step": 420 }, { "epoch": 0.76, "grad_norm": 2.0758602619171143, "learning_rate": 4.7108575924468926e-05, "loss": 1.2037, "step": 430 }, { "epoch": 0.78, "grad_norm": 2.5758004188537598, "learning_rate": 4.691188040912668e-05, "loss": 1.1067, "step": 440 }, { "epoch": 0.8, "grad_norm": 3.4717776775360107, "learning_rate": 4.671518489378442e-05, "loss": 1.2233, "step": 450 }, { "epoch": 0.81, "grad_norm": 2.4771013259887695, "learning_rate": 4.651848937844217e-05, "loss": 1.0819, "step": 460 }, { "epoch": 0.83, "grad_norm": 2.409405469894409, "learning_rate": 4.6321793863099924e-05, "loss": 1.2001, "step": 470 }, { "epoch": 0.85, "grad_norm": 2.6338610649108887, "learning_rate": 4.6125098347757675e-05, "loss": 1.0693, "step": 480 }, { "epoch": 0.87, "grad_norm": 3.0745251178741455, "learning_rate": 4.5928402832415426e-05, "loss": 1.1109, "step": 490 }, { "epoch": 0.88, "grad_norm": 2.047109842300415, "learning_rate": 4.573170731707318e-05, "loss": 1.0295, "step": 500 }, { "epoch": 0.9, "grad_norm": 4.1728410720825195, "learning_rate": 4.553501180173092e-05, "loss": 1.0533, "step": 510 }, { "epoch": 0.92, "grad_norm": 3.705878973007202, "learning_rate": 4.533831628638867e-05, "loss": 1.0001, "step": 520 }, { "epoch": 0.94, "grad_norm": 2.719705820083618, "learning_rate": 4.5141620771046424e-05, "loss": 1.0657, "step": 530 }, { "epoch": 0.95, "grad_norm": 3.654970169067383, "learning_rate": 4.4944925255704175e-05, "loss": 0.9326, "step": 540 }, { "epoch": 0.97, "grad_norm": 5.275374889373779, "learning_rate": 4.4748229740361926e-05, "loss": 1.0006, "step": 550 }, { "epoch": 0.99, "grad_norm": 3.071272373199463, "learning_rate": 4.455153422501967e-05, "loss": 1.0089, "step": 560 }, { "epoch": 1.0, "eval_accuracy": 0.9649200710479574, "eval_loss": 0.2877684533596039, "eval_runtime": 86.5125, "eval_samples_per_second": 26.031, "eval_steps_per_second": 3.26, "step": 565 }, { "epoch": 1.01, "grad_norm": 3.3852458000183105, "learning_rate": 4.435483870967742e-05, "loss": 0.865, "step": 570 }, { "epoch": 1.03, "grad_norm": 2.0667455196380615, "learning_rate": 4.415814319433517e-05, "loss": 0.9117, "step": 580 }, { "epoch": 1.04, "grad_norm": 1.9478442668914795, "learning_rate": 4.3961447678992924e-05, "loss": 0.8391, "step": 590 }, { "epoch": 1.06, "grad_norm": 0.9961579442024231, "learning_rate": 4.376475216365067e-05, "loss": 0.8528, "step": 600 }, { "epoch": 1.08, "grad_norm": 3.6760354042053223, "learning_rate": 4.356805664830842e-05, "loss": 0.8387, "step": 610 }, { "epoch": 1.1, "grad_norm": 3.119180917739868, "learning_rate": 4.337136113296617e-05, "loss": 0.7549, "step": 620 }, { "epoch": 1.11, "grad_norm": 3.2463724613189697, "learning_rate": 4.317466561762392e-05, "loss": 0.9329, "step": 630 }, { "epoch": 1.13, "grad_norm": 1.7437245845794678, "learning_rate": 4.297797010228167e-05, "loss": 0.824, "step": 640 }, { "epoch": 1.15, "grad_norm": 2.5226683616638184, "learning_rate": 4.278127458693942e-05, "loss": 0.8805, "step": 650 }, { "epoch": 1.17, "grad_norm": 2.30314302444458, "learning_rate": 4.258457907159717e-05, "loss": 0.9097, "step": 660 }, { "epoch": 1.18, "grad_norm": 2.0539016723632812, "learning_rate": 4.238788355625492e-05, "loss": 0.8936, "step": 670 }, { "epoch": 1.2, "grad_norm": 3.0018861293792725, "learning_rate": 4.219118804091267e-05, "loss": 0.9214, "step": 680 }, { "epoch": 1.22, "grad_norm": 2.0286073684692383, "learning_rate": 4.1994492525570416e-05, "loss": 0.7991, "step": 690 }, { "epoch": 1.24, "grad_norm": 0.8428072929382324, "learning_rate": 4.179779701022817e-05, "loss": 0.756, "step": 700 }, { "epoch": 1.26, "grad_norm": 1.7304598093032837, "learning_rate": 4.160110149488592e-05, "loss": 0.7738, "step": 710 }, { "epoch": 1.27, "grad_norm": 3.970334768295288, "learning_rate": 4.140440597954367e-05, "loss": 0.7426, "step": 720 }, { "epoch": 1.29, "grad_norm": 2.3020598888397217, "learning_rate": 4.1207710464201413e-05, "loss": 0.7884, "step": 730 }, { "epoch": 1.31, "grad_norm": 1.7566787004470825, "learning_rate": 4.1011014948859165e-05, "loss": 0.7569, "step": 740 }, { "epoch": 1.33, "grad_norm": 3.269785165786743, "learning_rate": 4.0814319433516916e-05, "loss": 0.8191, "step": 750 }, { "epoch": 1.34, "grad_norm": 2.191699504852295, "learning_rate": 4.061762391817467e-05, "loss": 0.8077, "step": 760 }, { "epoch": 1.36, "grad_norm": 1.7478408813476562, "learning_rate": 4.042092840283242e-05, "loss": 0.7621, "step": 770 }, { "epoch": 1.38, "grad_norm": 2.7008702754974365, "learning_rate": 4.022423288749016e-05, "loss": 0.81, "step": 780 }, { "epoch": 1.4, "grad_norm": 1.9396083354949951, "learning_rate": 4.0027537372147914e-05, "loss": 0.7493, "step": 790 }, { "epoch": 1.41, "grad_norm": 2.55366587638855, "learning_rate": 3.9830841856805665e-05, "loss": 0.7471, "step": 800 }, { "epoch": 1.43, "grad_norm": 1.6567083597183228, "learning_rate": 3.9634146341463416e-05, "loss": 0.9089, "step": 810 }, { "epoch": 1.45, "grad_norm": 4.915744781494141, "learning_rate": 3.943745082612117e-05, "loss": 0.8211, "step": 820 }, { "epoch": 1.47, "grad_norm": 1.2864570617675781, "learning_rate": 3.924075531077892e-05, "loss": 0.7662, "step": 830 }, { "epoch": 1.49, "grad_norm": 1.4654157161712646, "learning_rate": 3.904405979543666e-05, "loss": 0.7942, "step": 840 }, { "epoch": 1.5, "grad_norm": 5.769187927246094, "learning_rate": 3.8847364280094414e-05, "loss": 0.73, "step": 850 }, { "epoch": 1.52, "grad_norm": 0.6875308156013489, "learning_rate": 3.8650668764752165e-05, "loss": 0.7394, "step": 860 }, { "epoch": 1.54, "grad_norm": 1.2160513401031494, "learning_rate": 3.8453973249409916e-05, "loss": 0.846, "step": 870 }, { "epoch": 1.56, "grad_norm": 1.4410216808319092, "learning_rate": 3.825727773406767e-05, "loss": 0.7366, "step": 880 }, { "epoch": 1.57, "grad_norm": 4.212517261505127, "learning_rate": 3.806058221872542e-05, "loss": 0.8275, "step": 890 }, { "epoch": 1.59, "grad_norm": 0.9933316111564636, "learning_rate": 3.786388670338317e-05, "loss": 0.7688, "step": 900 }, { "epoch": 1.61, "grad_norm": 2.2545719146728516, "learning_rate": 3.7667191188040914e-05, "loss": 0.734, "step": 910 }, { "epoch": 1.63, "grad_norm": 1.0783274173736572, "learning_rate": 3.7470495672698665e-05, "loss": 0.8099, "step": 920 }, { "epoch": 1.64, "grad_norm": 1.6741780042648315, "learning_rate": 3.7273800157356416e-05, "loss": 0.9695, "step": 930 }, { "epoch": 1.66, "grad_norm": 3.448066473007202, "learning_rate": 3.707710464201417e-05, "loss": 0.7506, "step": 940 }, { "epoch": 1.68, "grad_norm": 9.674800872802734, "learning_rate": 3.688040912667191e-05, "loss": 0.6566, "step": 950 }, { "epoch": 1.7, "grad_norm": 6.721800327301025, "learning_rate": 3.668371361132966e-05, "loss": 0.7075, "step": 960 }, { "epoch": 1.72, "grad_norm": 5.588386058807373, "learning_rate": 3.6487018095987414e-05, "loss": 0.8246, "step": 970 }, { "epoch": 1.73, "grad_norm": 1.4187864065170288, "learning_rate": 3.6290322580645165e-05, "loss": 0.7808, "step": 980 }, { "epoch": 1.75, "grad_norm": 0.7722711563110352, "learning_rate": 3.6093627065302916e-05, "loss": 0.8957, "step": 990 }, { "epoch": 1.77, "grad_norm": 1.5007630586624146, "learning_rate": 3.589693154996066e-05, "loss": 0.7536, "step": 1000 }, { "epoch": 1.79, "grad_norm": 0.7198464870452881, "learning_rate": 3.570023603461841e-05, "loss": 0.6727, "step": 1010 }, { "epoch": 1.8, "grad_norm": 6.015268802642822, "learning_rate": 3.550354051927616e-05, "loss": 0.737, "step": 1020 }, { "epoch": 1.82, "grad_norm": 0.5802467465400696, "learning_rate": 3.5306845003933914e-05, "loss": 0.6929, "step": 1030 }, { "epoch": 1.84, "grad_norm": 6.546850681304932, "learning_rate": 3.511014948859166e-05, "loss": 0.7529, "step": 1040 }, { "epoch": 1.86, "grad_norm": 7.255277156829834, "learning_rate": 3.491345397324941e-05, "loss": 0.8639, "step": 1050 }, { "epoch": 1.87, "grad_norm": 0.564946711063385, "learning_rate": 3.471675845790716e-05, "loss": 0.7234, "step": 1060 }, { "epoch": 1.89, "grad_norm": 1.2637860774993896, "learning_rate": 3.452006294256491e-05, "loss": 0.9388, "step": 1070 }, { "epoch": 1.91, "grad_norm": 1.095790147781372, "learning_rate": 3.432336742722266e-05, "loss": 0.7637, "step": 1080 }, { "epoch": 1.93, "grad_norm": 4.213192462921143, "learning_rate": 3.412667191188041e-05, "loss": 0.7857, "step": 1090 }, { "epoch": 1.95, "grad_norm": 9.215189933776855, "learning_rate": 3.392997639653816e-05, "loss": 0.6239, "step": 1100 }, { "epoch": 1.96, "grad_norm": 2.098184585571289, "learning_rate": 3.373328088119591e-05, "loss": 0.5597, "step": 1110 }, { "epoch": 1.98, "grad_norm": 1.9442644119262695, "learning_rate": 3.353658536585366e-05, "loss": 0.7885, "step": 1120 }, { "epoch": 2.0, "grad_norm": 6.179882526397705, "learning_rate": 3.3339889850511406e-05, "loss": 0.7855, "step": 1130 }, { "epoch": 2.0, "eval_accuracy": 0.9791296625222025, "eval_loss": 0.10786169767379761, "eval_runtime": 86.2088, "eval_samples_per_second": 26.123, "eval_steps_per_second": 3.271, "step": 1131 }, { "epoch": 2.02, "grad_norm": 0.36537787318229675, "learning_rate": 3.314319433516916e-05, "loss": 0.8073, "step": 1140 }, { "epoch": 2.03, "grad_norm": 0.9752954840660095, "learning_rate": 3.294649881982691e-05, "loss": 0.586, "step": 1150 }, { "epoch": 2.05, "grad_norm": 0.6317528486251831, "learning_rate": 3.274980330448466e-05, "loss": 0.7136, "step": 1160 }, { "epoch": 2.07, "grad_norm": 0.4661918580532074, "learning_rate": 3.255310778914241e-05, "loss": 0.7598, "step": 1170 }, { "epoch": 2.09, "grad_norm": 0.6691815853118896, "learning_rate": 3.2356412273800155e-05, "loss": 0.9078, "step": 1180 }, { "epoch": 2.1, "grad_norm": 2.7598626613616943, "learning_rate": 3.2159716758457906e-05, "loss": 0.5849, "step": 1190 }, { "epoch": 2.12, "grad_norm": 0.4353080093860626, "learning_rate": 3.196302124311566e-05, "loss": 0.8519, "step": 1200 }, { "epoch": 2.14, "grad_norm": 0.5575789213180542, "learning_rate": 3.176632572777341e-05, "loss": 0.7124, "step": 1210 }, { "epoch": 2.16, "grad_norm": 2.028132677078247, "learning_rate": 3.156963021243116e-05, "loss": 0.8845, "step": 1220 }, { "epoch": 2.18, "grad_norm": 2.471712350845337, "learning_rate": 3.137293469708891e-05, "loss": 0.7683, "step": 1230 }, { "epoch": 2.19, "grad_norm": 3.2386505603790283, "learning_rate": 3.1176239181746655e-05, "loss": 0.6573, "step": 1240 }, { "epoch": 2.21, "grad_norm": 0.6279830932617188, "learning_rate": 3.0979543666404406e-05, "loss": 0.6261, "step": 1250 }, { "epoch": 2.23, "grad_norm": 1.3760743141174316, "learning_rate": 3.078284815106216e-05, "loss": 0.8395, "step": 1260 }, { "epoch": 2.25, "grad_norm": 0.6067408919334412, "learning_rate": 3.058615263571991e-05, "loss": 0.6631, "step": 1270 }, { "epoch": 2.26, "grad_norm": 1.8222625255584717, "learning_rate": 3.0389457120377656e-05, "loss": 0.7618, "step": 1280 }, { "epoch": 2.28, "grad_norm": 0.4705863296985626, "learning_rate": 3.0192761605035407e-05, "loss": 0.657, "step": 1290 }, { "epoch": 2.3, "grad_norm": 0.885718047618866, "learning_rate": 2.999606608969316e-05, "loss": 0.6618, "step": 1300 }, { "epoch": 2.32, "grad_norm": 0.5552624464035034, "learning_rate": 2.9799370574350903e-05, "loss": 0.7135, "step": 1310 }, { "epoch": 2.33, "grad_norm": 0.8557870388031006, "learning_rate": 2.9602675059008654e-05, "loss": 0.6492, "step": 1320 }, { "epoch": 2.35, "grad_norm": 1.226236343383789, "learning_rate": 2.9405979543666405e-05, "loss": 0.5645, "step": 1330 }, { "epoch": 2.37, "grad_norm": 0.4508123993873596, "learning_rate": 2.9209284028324156e-05, "loss": 0.653, "step": 1340 }, { "epoch": 2.39, "grad_norm": 5.647830963134766, "learning_rate": 2.9012588512981904e-05, "loss": 0.6795, "step": 1350 }, { "epoch": 2.4, "grad_norm": 0.6091474294662476, "learning_rate": 2.8815892997639655e-05, "loss": 0.7905, "step": 1360 }, { "epoch": 2.42, "grad_norm": 0.5606883764266968, "learning_rate": 2.8619197482297406e-05, "loss": 0.6878, "step": 1370 }, { "epoch": 2.44, "grad_norm": 2.7051727771759033, "learning_rate": 2.8422501966955157e-05, "loss": 0.6996, "step": 1380 }, { "epoch": 2.46, "grad_norm": 0.7471837401390076, "learning_rate": 2.822580645161291e-05, "loss": 0.706, "step": 1390 }, { "epoch": 2.48, "grad_norm": 2.322110176086426, "learning_rate": 2.8029110936270653e-05, "loss": 0.9161, "step": 1400 }, { "epoch": 2.49, "grad_norm": 0.7188151478767395, "learning_rate": 2.7832415420928404e-05, "loss": 0.6905, "step": 1410 }, { "epoch": 2.51, "grad_norm": 10.193795204162598, "learning_rate": 2.7635719905586155e-05, "loss": 0.7407, "step": 1420 }, { "epoch": 2.53, "grad_norm": 0.9392734169960022, "learning_rate": 2.7439024390243906e-05, "loss": 0.6731, "step": 1430 }, { "epoch": 2.55, "grad_norm": 0.5180323719978333, "learning_rate": 2.724232887490165e-05, "loss": 0.5429, "step": 1440 }, { "epoch": 2.56, "grad_norm": 0.4639306664466858, "learning_rate": 2.7045633359559402e-05, "loss": 0.7713, "step": 1450 }, { "epoch": 2.58, "grad_norm": 3.5896573066711426, "learning_rate": 2.6848937844217153e-05, "loss": 0.7773, "step": 1460 }, { "epoch": 2.6, "grad_norm": 0.7149112820625305, "learning_rate": 2.6652242328874904e-05, "loss": 0.6672, "step": 1470 }, { "epoch": 2.62, "grad_norm": 0.8536508679389954, "learning_rate": 2.645554681353265e-05, "loss": 0.7086, "step": 1480 }, { "epoch": 2.63, "grad_norm": 1.054091215133667, "learning_rate": 2.62588512981904e-05, "loss": 0.6519, "step": 1490 }, { "epoch": 2.65, "grad_norm": 0.39519792795181274, "learning_rate": 2.606215578284815e-05, "loss": 0.6434, "step": 1500 }, { "epoch": 2.67, "grad_norm": 0.4575929343700409, "learning_rate": 2.5865460267505902e-05, "loss": 0.6553, "step": 1510 }, { "epoch": 2.69, "grad_norm": 0.534916877746582, "learning_rate": 2.5668764752163653e-05, "loss": 0.7915, "step": 1520 }, { "epoch": 2.71, "grad_norm": 0.7122427821159363, "learning_rate": 2.54720692368214e-05, "loss": 0.6913, "step": 1530 }, { "epoch": 2.72, "grad_norm": 0.49569156765937805, "learning_rate": 2.5275373721479152e-05, "loss": 0.6085, "step": 1540 }, { "epoch": 2.74, "grad_norm": 0.466847687959671, "learning_rate": 2.5078678206136903e-05, "loss": 0.6222, "step": 1550 }, { "epoch": 2.76, "grad_norm": 0.34097394347190857, "learning_rate": 2.488198269079465e-05, "loss": 0.6883, "step": 1560 }, { "epoch": 2.78, "grad_norm": 1.601562261581421, "learning_rate": 2.4685287175452402e-05, "loss": 0.6904, "step": 1570 }, { "epoch": 2.79, "grad_norm": 0.5125943422317505, "learning_rate": 2.448859166011015e-05, "loss": 0.6484, "step": 1580 }, { "epoch": 2.81, "grad_norm": 0.4327985644340515, "learning_rate": 2.42918961447679e-05, "loss": 0.7579, "step": 1590 }, { "epoch": 2.83, "grad_norm": 0.4168800115585327, "learning_rate": 2.4095200629425652e-05, "loss": 0.7012, "step": 1600 }, { "epoch": 2.85, "grad_norm": 2.620213270187378, "learning_rate": 2.38985051140834e-05, "loss": 0.6107, "step": 1610 }, { "epoch": 2.86, "grad_norm": 0.47252315282821655, "learning_rate": 2.370180959874115e-05, "loss": 0.63, "step": 1620 }, { "epoch": 2.88, "grad_norm": 3.600597620010376, "learning_rate": 2.35051140833989e-05, "loss": 0.7014, "step": 1630 }, { "epoch": 2.9, "grad_norm": 0.9168177843093872, "learning_rate": 2.330841856805665e-05, "loss": 0.6946, "step": 1640 }, { "epoch": 2.92, "grad_norm": 2.111616611480713, "learning_rate": 2.3111723052714398e-05, "loss": 0.5557, "step": 1650 }, { "epoch": 2.94, "grad_norm": 0.4891892671585083, "learning_rate": 2.291502753737215e-05, "loss": 0.6718, "step": 1660 }, { "epoch": 2.95, "grad_norm": 1.4157081842422485, "learning_rate": 2.2718332022029897e-05, "loss": 0.7549, "step": 1670 }, { "epoch": 2.97, "grad_norm": 0.3661864399909973, "learning_rate": 2.2521636506687648e-05, "loss": 0.8072, "step": 1680 }, { "epoch": 2.99, "grad_norm": 2.6403448581695557, "learning_rate": 2.2324940991345396e-05, "loss": 0.6657, "step": 1690 }, { "epoch": 3.0, "eval_accuracy": 0.9866785079928952, "eval_loss": 0.07058515399694443, "eval_runtime": 87.367, "eval_samples_per_second": 25.776, "eval_steps_per_second": 3.228, "step": 1696 }, { "epoch": 3.01, "grad_norm": 0.5366094708442688, "learning_rate": 2.2128245476003147e-05, "loss": 0.7239, "step": 1700 }, { "epoch": 3.02, "grad_norm": 0.5593940019607544, "learning_rate": 2.1931549960660898e-05, "loss": 0.8491, "step": 1710 }, { "epoch": 3.04, "grad_norm": 0.2935149371623993, "learning_rate": 2.1734854445318646e-05, "loss": 0.6358, "step": 1720 }, { "epoch": 3.06, "grad_norm": 0.3379581570625305, "learning_rate": 2.1538158929976397e-05, "loss": 0.6124, "step": 1730 }, { "epoch": 3.08, "grad_norm": 0.6903249621391296, "learning_rate": 2.134146341463415e-05, "loss": 0.7021, "step": 1740 }, { "epoch": 3.09, "grad_norm": 0.5648875832557678, "learning_rate": 2.11447678992919e-05, "loss": 0.7315, "step": 1750 }, { "epoch": 3.11, "grad_norm": 3.177462100982666, "learning_rate": 2.0948072383949647e-05, "loss": 0.7836, "step": 1760 }, { "epoch": 3.13, "grad_norm": 2.3751437664031982, "learning_rate": 2.07513768686074e-05, "loss": 0.5984, "step": 1770 }, { "epoch": 3.15, "grad_norm": 0.39180535078048706, "learning_rate": 2.0554681353265146e-05, "loss": 0.6462, "step": 1780 }, { "epoch": 3.17, "grad_norm": 0.7616192102432251, "learning_rate": 2.0357985837922897e-05, "loss": 0.648, "step": 1790 }, { "epoch": 3.18, "grad_norm": 0.7866030931472778, "learning_rate": 2.0161290322580645e-05, "loss": 0.6992, "step": 1800 }, { "epoch": 3.2, "grad_norm": 2.5894863605499268, "learning_rate": 1.9964594807238396e-05, "loss": 0.7294, "step": 1810 }, { "epoch": 3.22, "grad_norm": 0.33011573553085327, "learning_rate": 1.9767899291896147e-05, "loss": 0.6715, "step": 1820 }, { "epoch": 3.24, "grad_norm": 1.21015465259552, "learning_rate": 1.9571203776553895e-05, "loss": 0.6617, "step": 1830 }, { "epoch": 3.25, "grad_norm": 0.3521706461906433, "learning_rate": 1.9374508261211646e-05, "loss": 0.7361, "step": 1840 }, { "epoch": 3.27, "grad_norm": 1.986220359802246, "learning_rate": 1.9177812745869394e-05, "loss": 0.782, "step": 1850 }, { "epoch": 3.29, "grad_norm": 0.6578150391578674, "learning_rate": 1.8981117230527145e-05, "loss": 0.6532, "step": 1860 }, { "epoch": 3.31, "grad_norm": 0.5501819252967834, "learning_rate": 1.8784421715184893e-05, "loss": 0.5756, "step": 1870 }, { "epoch": 3.32, "grad_norm": 1.0883512496948242, "learning_rate": 1.8587726199842644e-05, "loss": 0.6532, "step": 1880 }, { "epoch": 3.34, "grad_norm": 0.49157479405403137, "learning_rate": 1.8391030684500392e-05, "loss": 0.6951, "step": 1890 }, { "epoch": 3.36, "grad_norm": 0.6763016581535339, "learning_rate": 1.8194335169158143e-05, "loss": 0.5232, "step": 1900 }, { "epoch": 3.38, "grad_norm": 1.0173105001449585, "learning_rate": 1.799763965381589e-05, "loss": 0.6729, "step": 1910 }, { "epoch": 3.4, "grad_norm": 0.49198317527770996, "learning_rate": 1.7800944138473642e-05, "loss": 0.6914, "step": 1920 }, { "epoch": 3.41, "grad_norm": 0.7755669355392456, "learning_rate": 1.7604248623131393e-05, "loss": 0.6728, "step": 1930 }, { "epoch": 3.43, "grad_norm": 0.4817189574241638, "learning_rate": 1.7407553107789144e-05, "loss": 0.7147, "step": 1940 }, { "epoch": 3.45, "grad_norm": 1.0511811971664429, "learning_rate": 1.7210857592446896e-05, "loss": 0.7447, "step": 1950 }, { "epoch": 3.47, "grad_norm": 0.4621807038784027, "learning_rate": 1.7014162077104643e-05, "loss": 0.7905, "step": 1960 }, { "epoch": 3.48, "grad_norm": 0.42619049549102783, "learning_rate": 1.6817466561762395e-05, "loss": 0.6953, "step": 1970 }, { "epoch": 3.5, "grad_norm": 0.4883289933204651, "learning_rate": 1.6620771046420142e-05, "loss": 0.7246, "step": 1980 }, { "epoch": 3.52, "grad_norm": 5.140800476074219, "learning_rate": 1.6424075531077893e-05, "loss": 0.677, "step": 1990 }, { "epoch": 3.54, "grad_norm": 0.7420614361763, "learning_rate": 1.622738001573564e-05, "loss": 0.6223, "step": 2000 }, { "epoch": 3.55, "grad_norm": 0.5618042945861816, "learning_rate": 1.6030684500393392e-05, "loss": 0.7706, "step": 2010 }, { "epoch": 3.57, "grad_norm": 8.696101188659668, "learning_rate": 1.583398898505114e-05, "loss": 0.5982, "step": 2020 }, { "epoch": 3.59, "grad_norm": 0.613085925579071, "learning_rate": 1.563729346970889e-05, "loss": 0.5607, "step": 2030 }, { "epoch": 3.61, "grad_norm": 0.37136101722717285, "learning_rate": 1.5440597954366642e-05, "loss": 0.7066, "step": 2040 }, { "epoch": 3.63, "grad_norm": 1.0370784997940063, "learning_rate": 1.524390243902439e-05, "loss": 0.6117, "step": 2050 }, { "epoch": 3.64, "grad_norm": 3.805783748626709, "learning_rate": 1.5047206923682141e-05, "loss": 0.7247, "step": 2060 }, { "epoch": 3.66, "grad_norm": 0.3479630947113037, "learning_rate": 1.485051140833989e-05, "loss": 0.7094, "step": 2070 }, { "epoch": 3.68, "grad_norm": 0.27747422456741333, "learning_rate": 1.465381589299764e-05, "loss": 0.6548, "step": 2080 }, { "epoch": 3.7, "grad_norm": 0.4813224971294403, "learning_rate": 1.445712037765539e-05, "loss": 0.5747, "step": 2090 }, { "epoch": 3.71, "grad_norm": 1.6526601314544678, "learning_rate": 1.4260424862313141e-05, "loss": 0.8374, "step": 2100 }, { "epoch": 3.73, "grad_norm": 0.6736083626747131, "learning_rate": 1.4063729346970889e-05, "loss": 0.7677, "step": 2110 }, { "epoch": 3.75, "grad_norm": 1.8762106895446777, "learning_rate": 1.386703383162864e-05, "loss": 0.5897, "step": 2120 }, { "epoch": 3.77, "grad_norm": 0.622199296951294, "learning_rate": 1.3670338316286388e-05, "loss": 0.7407, "step": 2130 }, { "epoch": 3.78, "grad_norm": 0.3692738115787506, "learning_rate": 1.3473642800944139e-05, "loss": 0.8474, "step": 2140 }, { "epoch": 3.8, "grad_norm": 0.5641698241233826, "learning_rate": 1.327694728560189e-05, "loss": 0.6674, "step": 2150 }, { "epoch": 3.82, "grad_norm": 0.7227357029914856, "learning_rate": 1.3080251770259638e-05, "loss": 0.6852, "step": 2160 }, { "epoch": 3.84, "grad_norm": 0.80736243724823, "learning_rate": 1.2883556254917389e-05, "loss": 0.7686, "step": 2170 }, { "epoch": 3.85, "grad_norm": 0.7187584042549133, "learning_rate": 1.2686860739575138e-05, "loss": 0.7859, "step": 2180 }, { "epoch": 3.87, "grad_norm": 0.6288447976112366, "learning_rate": 1.249016522423289e-05, "loss": 0.683, "step": 2190 }, { "epoch": 3.89, "grad_norm": 0.6657484769821167, "learning_rate": 1.2293469708890639e-05, "loss": 0.6807, "step": 2200 }, { "epoch": 3.91, "grad_norm": 0.3658302128314972, "learning_rate": 1.2096774193548388e-05, "loss": 0.6274, "step": 2210 }, { "epoch": 3.93, "grad_norm": 0.7069531083106995, "learning_rate": 1.1900078678206138e-05, "loss": 0.7722, "step": 2220 }, { "epoch": 3.94, "grad_norm": 0.2362523227930069, "learning_rate": 1.1703383162863887e-05, "loss": 0.7107, "step": 2230 }, { "epoch": 3.96, "grad_norm": 0.49751946330070496, "learning_rate": 1.1506687647521637e-05, "loss": 0.6972, "step": 2240 }, { "epoch": 3.98, "grad_norm": 0.4281612038612366, "learning_rate": 1.1309992132179386e-05, "loss": 0.6935, "step": 2250 }, { "epoch": 4.0, "grad_norm": 0.47943803668022156, "learning_rate": 1.1113296616837136e-05, "loss": 0.7475, "step": 2260 }, { "epoch": 4.0, "eval_accuracy": 0.9866785079928952, "eval_loss": 0.06677553802728653, "eval_runtime": 87.1331, "eval_samples_per_second": 25.846, "eval_steps_per_second": 3.236, "step": 2262 }, { "epoch": 4.01, "grad_norm": 1.7420241832733154, "learning_rate": 1.0916601101494885e-05, "loss": 0.7495, "step": 2270 }, { "epoch": 4.03, "grad_norm": 0.43095558881759644, "learning_rate": 1.0719905586152636e-05, "loss": 0.6875, "step": 2280 }, { "epoch": 4.05, "grad_norm": 4.584779739379883, "learning_rate": 1.0523210070810386e-05, "loss": 0.8217, "step": 2290 }, { "epoch": 4.07, "grad_norm": 0.3836701810359955, "learning_rate": 1.0326514555468137e-05, "loss": 0.5724, "step": 2300 }, { "epoch": 4.08, "grad_norm": 0.8410959243774414, "learning_rate": 1.0129819040125886e-05, "loss": 0.6068, "step": 2310 }, { "epoch": 4.1, "grad_norm": 0.2401944100856781, "learning_rate": 9.933123524783636e-06, "loss": 0.6172, "step": 2320 }, { "epoch": 4.12, "grad_norm": 1.9553325176239014, "learning_rate": 9.736428009441385e-06, "loss": 0.5384, "step": 2330 }, { "epoch": 4.14, "grad_norm": 0.7399584650993347, "learning_rate": 9.539732494099135e-06, "loss": 0.6562, "step": 2340 }, { "epoch": 4.16, "grad_norm": 0.6674773097038269, "learning_rate": 9.343036978756884e-06, "loss": 0.736, "step": 2350 }, { "epoch": 4.17, "grad_norm": 0.35315585136413574, "learning_rate": 9.146341463414634e-06, "loss": 0.6334, "step": 2360 }, { "epoch": 4.19, "grad_norm": 0.8621203303337097, "learning_rate": 8.949645948072383e-06, "loss": 0.7214, "step": 2370 }, { "epoch": 4.21, "grad_norm": 6.545629501342773, "learning_rate": 8.752950432730134e-06, "loss": 0.5391, "step": 2380 }, { "epoch": 4.23, "grad_norm": 0.5393751859664917, "learning_rate": 8.556254917387884e-06, "loss": 0.6851, "step": 2390 }, { "epoch": 4.24, "grad_norm": 0.8293077945709229, "learning_rate": 8.359559402045635e-06, "loss": 0.791, "step": 2400 }, { "epoch": 4.26, "grad_norm": 0.44837647676467896, "learning_rate": 8.162863886703385e-06, "loss": 0.5622, "step": 2410 }, { "epoch": 4.28, "grad_norm": 0.5060232281684875, "learning_rate": 7.966168371361134e-06, "loss": 0.7254, "step": 2420 }, { "epoch": 4.3, "grad_norm": 0.647249698638916, "learning_rate": 7.769472856018883e-06, "loss": 0.5765, "step": 2430 }, { "epoch": 4.31, "grad_norm": 0.7065703868865967, "learning_rate": 7.572777340676633e-06, "loss": 0.7337, "step": 2440 }, { "epoch": 4.33, "grad_norm": 0.6291216015815735, "learning_rate": 7.376081825334382e-06, "loss": 0.649, "step": 2450 }, { "epoch": 4.35, "grad_norm": 0.6098889708518982, "learning_rate": 7.179386309992133e-06, "loss": 0.6712, "step": 2460 }, { "epoch": 4.37, "grad_norm": 0.49595367908477783, "learning_rate": 6.982690794649882e-06, "loss": 0.5932, "step": 2470 }, { "epoch": 4.39, "grad_norm": 0.34834733605384827, "learning_rate": 6.785995279307632e-06, "loss": 0.5702, "step": 2480 }, { "epoch": 4.4, "grad_norm": 0.30979278683662415, "learning_rate": 6.589299763965381e-06, "loss": 0.6423, "step": 2490 }, { "epoch": 4.42, "grad_norm": 1.0691258907318115, "learning_rate": 6.392604248623131e-06, "loss": 0.6254, "step": 2500 }, { "epoch": 4.44, "grad_norm": 0.6674053072929382, "learning_rate": 6.195908733280882e-06, "loss": 0.6869, "step": 2510 }, { "epoch": 4.46, "grad_norm": 0.4127647280693054, "learning_rate": 5.999213217938631e-06, "loss": 0.6614, "step": 2520 }, { "epoch": 4.47, "grad_norm": 1.5234615802764893, "learning_rate": 5.802517702596381e-06, "loss": 0.716, "step": 2530 }, { "epoch": 4.49, "grad_norm": 0.3525974750518799, "learning_rate": 5.605822187254131e-06, "loss": 0.8085, "step": 2540 }, { "epoch": 4.51, "grad_norm": 0.9571001529693604, "learning_rate": 5.40912667191188e-06, "loss": 0.6391, "step": 2550 }, { "epoch": 4.53, "grad_norm": 0.5705153942108154, "learning_rate": 5.212431156569631e-06, "loss": 0.5403, "step": 2560 }, { "epoch": 4.54, "grad_norm": 0.6829048991203308, "learning_rate": 5.01573564122738e-06, "loss": 0.6538, "step": 2570 }, { "epoch": 4.56, "grad_norm": 1.4802197217941284, "learning_rate": 4.81904012588513e-06, "loss": 0.7467, "step": 2580 }, { "epoch": 4.58, "grad_norm": 1.3378098011016846, "learning_rate": 4.62234461054288e-06, "loss": 0.7882, "step": 2590 }, { "epoch": 4.6, "grad_norm": 0.3483523428440094, "learning_rate": 4.425649095200629e-06, "loss": 0.6253, "step": 2600 }, { "epoch": 4.62, "grad_norm": 0.7070222496986389, "learning_rate": 4.22895357985838e-06, "loss": 0.8082, "step": 2610 }, { "epoch": 4.63, "grad_norm": 0.2973116934299469, "learning_rate": 4.032258064516129e-06, "loss": 0.7245, "step": 2620 }, { "epoch": 4.65, "grad_norm": 0.6432918310165405, "learning_rate": 3.835562549173879e-06, "loss": 0.5764, "step": 2630 }, { "epoch": 4.67, "grad_norm": 0.8339099287986755, "learning_rate": 3.638867033831629e-06, "loss": 0.607, "step": 2640 }, { "epoch": 4.69, "grad_norm": 0.5163792371749878, "learning_rate": 3.442171518489379e-06, "loss": 0.6602, "step": 2650 }, { "epoch": 4.7, "grad_norm": 1.1758856773376465, "learning_rate": 3.2454760031471283e-06, "loss": 0.8349, "step": 2660 }, { "epoch": 4.72, "grad_norm": 0.5800232887268066, "learning_rate": 3.0487804878048782e-06, "loss": 0.6944, "step": 2670 }, { "epoch": 4.74, "grad_norm": 0.342181921005249, "learning_rate": 2.852084972462628e-06, "loss": 0.6855, "step": 2680 }, { "epoch": 4.76, "grad_norm": 0.5758516788482666, "learning_rate": 2.655389457120378e-06, "loss": 0.5884, "step": 2690 }, { "epoch": 4.77, "grad_norm": 0.42382463812828064, "learning_rate": 2.4586939417781275e-06, "loss": 0.7499, "step": 2700 }, { "epoch": 4.79, "grad_norm": 0.5452685952186584, "learning_rate": 2.2619984264358773e-06, "loss": 0.6471, "step": 2710 }, { "epoch": 4.81, "grad_norm": 1.8580130338668823, "learning_rate": 2.0653029110936272e-06, "loss": 0.6492, "step": 2720 }, { "epoch": 4.83, "grad_norm": 1.0084635019302368, "learning_rate": 1.868607395751377e-06, "loss": 0.5958, "step": 2730 }, { "epoch": 4.85, "grad_norm": 0.44602665305137634, "learning_rate": 1.6719118804091268e-06, "loss": 0.6962, "step": 2740 }, { "epoch": 4.86, "grad_norm": 0.45221227407455444, "learning_rate": 1.4752163650668765e-06, "loss": 0.6997, "step": 2750 }, { "epoch": 4.88, "grad_norm": 1.639831304550171, "learning_rate": 1.2785208497246264e-06, "loss": 0.6848, "step": 2760 }, { "epoch": 4.9, "grad_norm": 0.6076949238777161, "learning_rate": 1.0818253343823763e-06, "loss": 0.6033, "step": 2770 }, { "epoch": 4.92, "grad_norm": 0.46627792716026306, "learning_rate": 8.85129819040126e-07, "loss": 0.6447, "step": 2780 }, { "epoch": 4.93, "grad_norm": 1.5518214702606201, "learning_rate": 6.884343036978757e-07, "loss": 0.7177, "step": 2790 }, { "epoch": 4.95, "grad_norm": 0.4077186584472656, "learning_rate": 4.917387883556255e-07, "loss": 0.6174, "step": 2800 }, { "epoch": 4.97, "grad_norm": 0.6033698916435242, "learning_rate": 2.9504327301337533e-07, "loss": 0.5902, "step": 2810 }, { "epoch": 4.99, "grad_norm": 0.20630072057247162, "learning_rate": 9.834775767112511e-08, "loss": 0.6021, "step": 2820 }, { "epoch": 5.0, "eval_accuracy": 0.9853463587921847, "eval_loss": 0.0637223869562149, "eval_runtime": 86.6358, "eval_samples_per_second": 25.994, "eval_steps_per_second": 3.255, "step": 2825 }, { "epoch": 5.0, "step": 2825, "total_flos": 1.6016486531331686e+18, "train_loss": 0.9856665741236864, "train_runtime": 4486.694, "train_samples_per_second": 20.16, "train_steps_per_second": 0.63 } ], "logging_steps": 10, "max_steps": 2825, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.6016486531331686e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }