diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6205 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.389467, + "global_step": 1000000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.9820000000000003e-06, + "loss": 82.5404, + "step": 1000 + }, + { + "epoch": 0.0, + "learning_rate": 3.98e-06, + "loss": 57.4563, + "step": 2000 + }, + { + "epoch": 0.0, + "learning_rate": 5.98e-06, + "loss": 41.077, + "step": 3000 + }, + { + "epoch": 0.0, + "learning_rate": 7.978e-06, + "loss": 30.4085, + "step": 4000 + }, + { + "epoch": 0.01, + "learning_rate": 9.976e-06, + "loss": 23.8433, + "step": 5000 + }, + { + "epoch": 0.01, + "learning_rate": 1.1976000000000002e-05, + "loss": 18.0328, + "step": 6000 + }, + { + "epoch": 0.01, + "learning_rate": 1.3976e-05, + "loss": 14.5402, + "step": 7000 + }, + { + "epoch": 0.01, + "learning_rate": 1.5976e-05, + "loss": 12.3042, + "step": 8000 + }, + { + "epoch": 0.01, + "learning_rate": 1.7976000000000002e-05, + "loss": 11.2017, + "step": 9000 + }, + { + "epoch": 0.01, + "learning_rate": 1.9974000000000002e-05, + "loss": 10.2353, + "step": 10000 + }, + { + "epoch": 0.01, + "learning_rate": 2.1974e-05, + "loss": 9.5083, + "step": 11000 + }, + { + "epoch": 0.01, + "learning_rate": 2.3974e-05, + "loss": 9.0682, + "step": 12000 + }, + { + "epoch": 0.01, + "learning_rate": 2.5974000000000002e-05, + "loss": 8.6615, + "step": 13000 + }, + { + "epoch": 0.01, + "learning_rate": 2.7974e-05, + "loss": 8.4494, + "step": 14000 + }, + { + "epoch": 0.01, + "learning_rate": 2.9974e-05, + "loss": 8.1137, + "step": 15000 + }, + { + "epoch": 0.02, + "learning_rate": 3.1974e-05, + "loss": 7.8827, + "step": 16000 + }, + { + "epoch": 0.02, + "learning_rate": 3.3974e-05, + "loss": 7.6477, + "step": 17000 + }, + { + "epoch": 0.02, + "learning_rate": 3.5972e-05, + "loss": 7.4429, + "step": 18000 + }, + { + "epoch": 0.02, + "learning_rate": 3.7972e-05, + "loss": 7.5768, + "step": 19000 + }, + { + "epoch": 0.02, + "learning_rate": 3.9972e-05, + "loss": 7.467, + "step": 20000 + }, + { + "epoch": 0.02, + "learning_rate": 4.1972e-05, + "loss": 7.4095, + "step": 21000 + }, + { + "epoch": 0.02, + "learning_rate": 4.3972000000000005e-05, + "loss": 7.4893, + "step": 22000 + }, + { + "epoch": 0.02, + "learning_rate": 4.597e-05, + "loss": 7.426, + "step": 23000 + }, + { + "epoch": 0.02, + "learning_rate": 4.7968e-05, + "loss": 7.2462, + "step": 24000 + }, + { + "epoch": 0.03, + "learning_rate": 4.996800000000001e-05, + "loss": 7.2831, + "step": 25000 + }, + { + "epoch": 0.03, + "learning_rate": 5.196800000000001e-05, + "loss": 7.0669, + "step": 26000 + }, + { + "epoch": 0.03, + "learning_rate": 5.396800000000001e-05, + "loss": 6.8339, + "step": 27000 + }, + { + "epoch": 0.03, + "learning_rate": 5.5968e-05, + "loss": 6.7626, + "step": 28000 + }, + { + "epoch": 0.03, + "learning_rate": 5.7968e-05, + "loss": 6.8304, + "step": 29000 + }, + { + "epoch": 0.03, + "learning_rate": 5.9968e-05, + "loss": 6.2205, + "step": 30000 + }, + { + "epoch": 0.03, + "learning_rate": 6.1968e-05, + "loss": 5.5114, + "step": 31000 + }, + { + "epoch": 0.03, + "learning_rate": 6.396800000000001e-05, + "loss": 4.9834, + "step": 32000 + }, + { + "epoch": 0.03, + "learning_rate": 6.5968e-05, + "loss": 4.4191, + "step": 33000 + }, + { + "epoch": 0.03, + "learning_rate": 6.7968e-05, + "loss": 3.2657, + "step": 34000 + }, + { + "epoch": 0.04, + "learning_rate": 6.9968e-05, + "loss": 3.1636, + "step": 35000 + }, + { + "epoch": 0.04, + "learning_rate": 7.1968e-05, + "loss": 2.9531, + "step": 36000 + }, + { + "epoch": 0.04, + "learning_rate": 7.3966e-05, + "loss": 2.8223, + "step": 37000 + }, + { + "epoch": 0.04, + "learning_rate": 7.5966e-05, + "loss": 2.8884, + "step": 38000 + }, + { + "epoch": 0.04, + "learning_rate": 7.7966e-05, + "loss": 2.9467, + "step": 39000 + }, + { + "epoch": 0.04, + "learning_rate": 7.996600000000001e-05, + "loss": 2.6816, + "step": 40000 + }, + { + "epoch": 0.04, + "learning_rate": 8.196600000000001e-05, + "loss": 2.4951, + "step": 41000 + }, + { + "epoch": 0.04, + "learning_rate": 8.3966e-05, + "loss": 2.3872, + "step": 42000 + }, + { + "epoch": 0.04, + "learning_rate": 8.5966e-05, + "loss": 2.382, + "step": 43000 + }, + { + "epoch": 0.04, + "learning_rate": 8.7966e-05, + "loss": 2.0369, + "step": 44000 + }, + { + "epoch": 0.04, + "learning_rate": 8.996600000000001e-05, + "loss": 2.0121, + "step": 45000 + }, + { + "epoch": 0.05, + "learning_rate": 9.196600000000001e-05, + "loss": 1.8462, + "step": 46000 + }, + { + "epoch": 0.05, + "learning_rate": 9.396600000000001e-05, + "loss": 1.8439, + "step": 47000 + }, + { + "epoch": 0.05, + "learning_rate": 9.5964e-05, + "loss": 2.1532, + "step": 48000 + }, + { + "epoch": 0.05, + "learning_rate": 9.7964e-05, + "loss": 2.0483, + "step": 49000 + }, + { + "epoch": 0.05, + "learning_rate": 9.996400000000001e-05, + "loss": 2.1008, + "step": 50000 + }, + { + "epoch": 0.05, + "eval_accuracy": 0.6533359021520881, + "eval_loss": 2.052328586578369, + "eval_runtime": 12.2044, + "eval_samples_per_second": 409.687, + "eval_steps_per_second": 0.819, + "step": 50000 + }, + { + "epoch": 0.05, + "learning_rate": 9.99997363576723e-05, + "loss": 2.0612, + "step": 51000 + }, + { + "epoch": 0.05, + "learning_rate": 9.999892709828038e-05, + "loss": 2.0449, + "step": 52000 + }, + { + "epoch": 0.05, + "learning_rate": 9.999757052195638e-05, + "loss": 1.8157, + "step": 53000 + }, + { + "epoch": 0.05, + "learning_rate": 9.999566718021146e-05, + "loss": 1.6483, + "step": 54000 + }, + { + "epoch": 0.06, + "learning_rate": 9.999322253966728e-05, + "loss": 1.7841, + "step": 55000 + }, + { + "epoch": 0.06, + "learning_rate": 9.999022682890849e-05, + "loss": 1.7667, + "step": 56000 + }, + { + "epoch": 0.06, + "learning_rate": 9.998668443303824e-05, + "loss": 1.7684, + "step": 57000 + }, + { + "epoch": 0.06, + "learning_rate": 9.998259539079562e-05, + "loss": 1.9884, + "step": 58000 + }, + { + "epoch": 0.06, + "learning_rate": 9.997796465555275e-05, + "loss": 1.9409, + "step": 59000 + }, + { + "epoch": 0.06, + "learning_rate": 9.99727884618501e-05, + "loss": 1.9498, + "step": 60000 + }, + { + "epoch": 0.06, + "learning_rate": 9.996706086563065e-05, + "loss": 1.961, + "step": 61000 + }, + { + "epoch": 0.06, + "learning_rate": 9.996079338458734e-05, + "loss": 1.9519, + "step": 62000 + }, + { + "epoch": 0.06, + "learning_rate": 9.995397353976218e-05, + "loss": 1.6914, + "step": 63000 + }, + { + "epoch": 0.06, + "learning_rate": 9.994660740628572e-05, + "loss": 1.7069, + "step": 64000 + }, + { + "epoch": 0.07, + "learning_rate": 9.993869506471283e-05, + "loss": 1.6265, + "step": 65000 + }, + { + "epoch": 0.07, + "learning_rate": 9.993023660157159e-05, + "loss": 1.5434, + "step": 66000 + }, + { + "epoch": 0.07, + "learning_rate": 9.992123210936239e-05, + "loss": 1.5563, + "step": 67000 + }, + { + "epoch": 0.07, + "learning_rate": 9.991168168655695e-05, + "loss": 1.7248, + "step": 68000 + }, + { + "epoch": 0.07, + "learning_rate": 9.990158543759713e-05, + "loss": 1.7096, + "step": 69000 + }, + { + "epoch": 0.07, + "learning_rate": 9.98909543874054e-05, + "loss": 1.7824, + "step": 70000 + }, + { + "epoch": 0.07, + "learning_rate": 9.987976736887685e-05, + "loss": 1.7253, + "step": 71000 + }, + { + "epoch": 0.07, + "learning_rate": 9.98680468781231e-05, + "loss": 1.7548, + "step": 72000 + }, + { + "epoch": 0.07, + "learning_rate": 9.9855769578893e-05, + "loss": 1.418, + "step": 73000 + }, + { + "epoch": 0.07, + "learning_rate": 9.984294706495438e-05, + "loss": 1.3509, + "step": 74000 + }, + { + "epoch": 0.07, + "learning_rate": 9.982957947653222e-05, + "loss": 1.2693, + "step": 75000 + }, + { + "epoch": 0.08, + "learning_rate": 9.981568114447057e-05, + "loss": 1.2466, + "step": 76000 + }, + { + "epoch": 0.08, + "learning_rate": 9.980123912510795e-05, + "loss": 1.4623, + "step": 77000 + }, + { + "epoch": 0.08, + "learning_rate": 9.978623830325964e-05, + "loss": 1.5242, + "step": 78000 + }, + { + "epoch": 0.08, + "learning_rate": 9.977069302708569e-05, + "loss": 1.4762, + "step": 79000 + }, + { + "epoch": 0.08, + "learning_rate": 9.975461982795923e-05, + "loss": 1.6135, + "step": 80000 + }, + { + "epoch": 0.08, + "learning_rate": 9.97379867031067e-05, + "loss": 1.6171, + "step": 81000 + }, + { + "epoch": 0.08, + "learning_rate": 9.972080965160031e-05, + "loss": 1.5516, + "step": 82000 + }, + { + "epoch": 0.08, + "learning_rate": 9.970308886128555e-05, + "loss": 1.3369, + "step": 83000 + }, + { + "epoch": 0.08, + "learning_rate": 9.968488013163436e-05, + "loss": 1.3425, + "step": 84000 + }, + { + "epoch": 0.09, + "learning_rate": 9.966607408075336e-05, + "loss": 1.2589, + "step": 85000 + }, + { + "epoch": 0.09, + "learning_rate": 9.964672488964346e-05, + "loss": 1.1773, + "step": 86000 + }, + { + "epoch": 0.09, + "learning_rate": 9.962683276990424e-05, + "loss": 1.4146, + "step": 87000 + }, + { + "epoch": 0.09, + "learning_rate": 9.960639793907279e-05, + "loss": 1.5667, + "step": 88000 + }, + { + "epoch": 0.09, + "learning_rate": 9.958544186883626e-05, + "loss": 1.6244, + "step": 89000 + }, + { + "epoch": 0.09, + "learning_rate": 9.956392283431062e-05, + "loss": 1.6549, + "step": 90000 + }, + { + "epoch": 0.09, + "learning_rate": 9.954188410838409e-05, + "loss": 1.6582, + "step": 91000 + }, + { + "epoch": 0.09, + "learning_rate": 9.951928181053549e-05, + "loss": 1.5158, + "step": 92000 + }, + { + "epoch": 0.09, + "learning_rate": 9.949613797775468e-05, + "loss": 1.4381, + "step": 93000 + }, + { + "epoch": 0.09, + "learning_rate": 9.947245286313893e-05, + "loss": 1.3927, + "step": 94000 + }, + { + "epoch": 0.1, + "learning_rate": 9.944822672570486e-05, + "loss": 1.3224, + "step": 95000 + }, + { + "epoch": 0.1, + "learning_rate": 9.942348486729986e-05, + "loss": 1.3627, + "step": 96000 + }, + { + "epoch": 0.1, + "learning_rate": 9.939820360201583e-05, + "loss": 1.5653, + "step": 97000 + }, + { + "epoch": 0.1, + "learning_rate": 9.937235708951778e-05, + "loss": 1.5405, + "step": 98000 + }, + { + "epoch": 0.1, + "learning_rate": 9.934597064883276e-05, + "loss": 1.5795, + "step": 99000 + }, + { + "epoch": 0.1, + "learning_rate": 9.931904456851861e-05, + "loss": 1.5248, + "step": 100000 + }, + { + "epoch": 0.1, + "eval_accuracy": 0.76607755765641, + "eval_loss": 1.1575313806533813, + "eval_runtime": 11.3109, + "eval_samples_per_second": 442.05, + "eval_steps_per_second": 0.884, + "step": 100000 + }, + { + "epoch": 0.1, + "learning_rate": 9.929160687776363e-05, + "loss": 1.5151, + "step": 101000 + }, + { + "epoch": 0.1, + "learning_rate": 9.926363121944323e-05, + "loss": 1.3509, + "step": 102000 + }, + { + "epoch": 0.1, + "learning_rate": 9.923508908775465e-05, + "loss": 1.361, + "step": 103000 + }, + { + "epoch": 0.1, + "learning_rate": 9.920600852902e-05, + "loss": 1.2428, + "step": 104000 + }, + { + "epoch": 0.1, + "learning_rate": 9.917641974860546e-05, + "loss": 1.2109, + "step": 105000 + }, + { + "epoch": 0.11, + "learning_rate": 9.914626383334564e-05, + "loss": 1.2223, + "step": 106000 + }, + { + "epoch": 0.11, + "learning_rate": 9.911557046241845e-05, + "loss": 1.3193, + "step": 107000 + }, + { + "epoch": 0.11, + "learning_rate": 9.90843399714817e-05, + "loss": 1.2732, + "step": 108000 + }, + { + "epoch": 0.11, + "learning_rate": 9.905260473734215e-05, + "loss": 1.3843, + "step": 109000 + }, + { + "epoch": 0.11, + "learning_rate": 9.902033414410157e-05, + "loss": 1.4177, + "step": 110000 + }, + { + "epoch": 0.11, + "learning_rate": 9.898749543760183e-05, + "loss": 1.3797, + "step": 111000 + }, + { + "epoch": 0.11, + "learning_rate": 9.89541210117005e-05, + "loss": 1.1533, + "step": 112000 + }, + { + "epoch": 0.11, + "learning_rate": 9.89202112313749e-05, + "loss": 1.246, + "step": 113000 + }, + { + "epoch": 0.11, + "learning_rate": 9.888580117932034e-05, + "loss": 1.1726, + "step": 114000 + }, + { + "epoch": 0.12, + "learning_rate": 9.885082234290936e-05, + "loss": 1.0864, + "step": 115000 + }, + { + "epoch": 0.12, + "learning_rate": 9.881534506150948e-05, + "loss": 1.3172, + "step": 116000 + }, + { + "epoch": 0.12, + "learning_rate": 9.877929869757103e-05, + "loss": 1.3628, + "step": 117000 + }, + { + "epoch": 0.12, + "learning_rate": 9.874271889103715e-05, + "loss": 1.3292, + "step": 118000 + }, + { + "epoch": 0.12, + "learning_rate": 9.8705643420908e-05, + "loss": 1.3346, + "step": 119000 + }, + { + "epoch": 0.12, + "learning_rate": 9.86679984675373e-05, + "loss": 1.3594, + "step": 120000 + }, + { + "epoch": 0.12, + "learning_rate": 9.862985973161956e-05, + "loss": 1.4662, + "step": 121000 + }, + { + "epoch": 0.12, + "learning_rate": 9.859115127647795e-05, + "loss": 1.2652, + "step": 122000 + }, + { + "epoch": 0.12, + "learning_rate": 9.855195094141563e-05, + "loss": 1.3002, + "step": 123000 + }, + { + "epoch": 0.12, + "learning_rate": 9.851218067604352e-05, + "loss": 1.234, + "step": 124000 + }, + { + "epoch": 0.12, + "learning_rate": 9.847187988923588e-05, + "loss": 1.2216, + "step": 125000 + }, + { + "epoch": 0.13, + "learning_rate": 9.843104902171565e-05, + "loss": 1.4338, + "step": 126000 + }, + { + "epoch": 0.13, + "learning_rate": 9.83897301449066e-05, + "loss": 1.4297, + "step": 127000 + }, + { + "epoch": 0.13, + "learning_rate": 9.834784099026668e-05, + "loss": 1.4427, + "step": 128000 + }, + { + "epoch": 0.13, + "learning_rate": 9.830542311138371e-05, + "loss": 1.4926, + "step": 129000 + }, + { + "epoch": 0.13, + "learning_rate": 9.826252018198228e-05, + "loss": 1.4958, + "step": 130000 + }, + { + "epoch": 0.13, + "learning_rate": 9.821904677956975e-05, + "loss": 1.3699, + "step": 131000 + }, + { + "epoch": 0.13, + "learning_rate": 9.817513458876564e-05, + "loss": 1.2653, + "step": 132000 + }, + { + "epoch": 0.13, + "learning_rate": 9.813060808918262e-05, + "loss": 1.1692, + "step": 133000 + }, + { + "epoch": 0.13, + "learning_rate": 9.808555524098074e-05, + "loss": 1.1308, + "step": 134000 + }, + { + "epoch": 0.14, + "learning_rate": 9.803997653685072e-05, + "loss": 1.2077, + "step": 135000 + }, + { + "epoch": 0.14, + "learning_rate": 9.799391884154434e-05, + "loss": 1.3723, + "step": 136000 + }, + { + "epoch": 0.14, + "learning_rate": 9.794733734161261e-05, + "loss": 1.2926, + "step": 137000 + }, + { + "epoch": 0.14, + "learning_rate": 9.790018513149605e-05, + "loss": 1.3797, + "step": 138000 + }, + { + "epoch": 0.14, + "learning_rate": 9.785250909262922e-05, + "loss": 1.3112, + "step": 139000 + }, + { + "epoch": 0.14, + "learning_rate": 9.780430974638956e-05, + "loss": 1.2632, + "step": 140000 + }, + { + "epoch": 0.14, + "learning_rate": 9.775563660295586e-05, + "loss": 1.0677, + "step": 141000 + }, + { + "epoch": 0.14, + "learning_rate": 9.770639275096809e-05, + "loss": 1.1267, + "step": 142000 + }, + { + "epoch": 0.14, + "learning_rate": 9.765662718951243e-05, + "loss": 1.0702, + "step": 143000 + }, + { + "epoch": 0.14, + "learning_rate": 9.760634046281708e-05, + "loss": 0.9839, + "step": 144000 + }, + { + "epoch": 0.14, + "learning_rate": 9.755558418801454e-05, + "loss": 1.1214, + "step": 145000 + }, + { + "epoch": 0.15, + "learning_rate": 9.750425730609625e-05, + "loss": 1.3301, + "step": 146000 + }, + { + "epoch": 0.15, + "learning_rate": 9.74524109252309e-05, + "loss": 1.2632, + "step": 147000 + }, + { + "epoch": 0.15, + "learning_rate": 9.740009823673144e-05, + "loss": 1.3007, + "step": 148000 + }, + { + "epoch": 0.15, + "learning_rate": 9.734721508266946e-05, + "loss": 1.3467, + "step": 149000 + }, + { + "epoch": 0.15, + "learning_rate": 9.729386780642266e-05, + "loss": 1.3152, + "step": 150000 + }, + { + "epoch": 0.15, + "eval_accuracy": 0.767424813667675, + "eval_loss": 1.1280603408813477, + "eval_runtime": 11.4235, + "eval_samples_per_second": 437.694, + "eval_steps_per_second": 0.875, + "step": 150000 + }, + { + "epoch": 0.15, + "learning_rate": 9.723995019013185e-05, + "loss": 1.086, + "step": 151000 + }, + { + "epoch": 0.15, + "learning_rate": 9.718557065738742e-05, + "loss": 1.0641, + "step": 152000 + }, + { + "epoch": 0.15, + "learning_rate": 9.713062093504106e-05, + "loss": 1.0293, + "step": 153000 + }, + { + "epoch": 0.15, + "learning_rate": 9.707515579977516e-05, + "loss": 1.1034, + "step": 154000 + }, + { + "epoch": 0.15, + "learning_rate": 9.701917585814753e-05, + "loss": 1.338, + "step": 155000 + }, + { + "epoch": 0.16, + "learning_rate": 9.696268172234585e-05, + "loss": 1.3888, + "step": 156000 + }, + { + "epoch": 0.16, + "learning_rate": 9.690573127421747e-05, + "loss": 1.3721, + "step": 157000 + }, + { + "epoch": 0.16, + "learning_rate": 9.684826889792009e-05, + "loss": 1.4046, + "step": 158000 + }, + { + "epoch": 0.16, + "learning_rate": 9.67902369329362e-05, + "loss": 1.42, + "step": 159000 + }, + { + "epoch": 0.16, + "learning_rate": 9.673169327741768e-05, + "loss": 1.3545, + "step": 160000 + }, + { + "epoch": 0.16, + "learning_rate": 9.667263857158856e-05, + "loss": 1.2049, + "step": 161000 + }, + { + "epoch": 0.16, + "learning_rate": 9.661313328110268e-05, + "loss": 1.208, + "step": 162000 + }, + { + "epoch": 0.16, + "learning_rate": 9.655311925585616e-05, + "loss": 1.1505, + "step": 163000 + }, + { + "epoch": 0.16, + "learning_rate": 9.649253631382465e-05, + "loss": 1.1549, + "step": 164000 + }, + { + "epoch": 0.17, + "learning_rate": 9.643150628198306e-05, + "loss": 1.3294, + "step": 165000 + }, + { + "epoch": 0.17, + "learning_rate": 9.63699076456116e-05, + "loss": 1.2572, + "step": 166000 + }, + { + "epoch": 0.17, + "learning_rate": 9.630780191535927e-05, + "loss": 1.3036, + "step": 167000 + }, + { + "epoch": 0.17, + "learning_rate": 9.624518977040433e-05, + "loss": 1.3684, + "step": 168000 + }, + { + "epoch": 0.17, + "learning_rate": 9.6182135265721e-05, + "loss": 1.3411, + "step": 169000 + }, + { + "epoch": 0.17, + "learning_rate": 9.611851285573377e-05, + "loss": 1.1562, + "step": 170000 + }, + { + "epoch": 0.17, + "learning_rate": 9.605445047952064e-05, + "loss": 1.1598, + "step": 171000 + }, + { + "epoch": 0.17, + "learning_rate": 9.598982058476909e-05, + "loss": 1.0123, + "step": 172000 + }, + { + "epoch": 0.17, + "learning_rate": 9.592468775270784e-05, + "loss": 0.9223, + "step": 173000 + }, + { + "epoch": 0.17, + "learning_rate": 9.585905269561911e-05, + "loss": 0.9687, + "step": 174000 + }, + { + "epoch": 0.17, + "learning_rate": 9.579291613127737e-05, + "loss": 1.2193, + "step": 175000 + }, + { + "epoch": 0.18, + "learning_rate": 9.572634567018936e-05, + "loss": 1.1626, + "step": 176000 + }, + { + "epoch": 0.18, + "learning_rate": 9.565920876628414e-05, + "loss": 1.1968, + "step": 177000 + }, + { + "epoch": 0.18, + "learning_rate": 9.559164042597832e-05, + "loss": 1.2435, + "step": 178000 + }, + { + "epoch": 0.18, + "learning_rate": 9.552350611635926e-05, + "loss": 1.2996, + "step": 179000 + }, + { + "epoch": 0.18, + "learning_rate": 9.545487396897081e-05, + "loss": 1.0764, + "step": 180000 + }, + { + "epoch": 0.18, + "learning_rate": 9.53859528647097e-05, + "loss": 1.0733, + "step": 181000 + }, + { + "epoch": 0.18, + "learning_rate": 9.531632878672679e-05, + "loss": 1.0556, + "step": 182000 + }, + { + "epoch": 0.18, + "learning_rate": 9.52462091366326e-05, + "loss": 0.9726, + "step": 183000 + }, + { + "epoch": 0.18, + "learning_rate": 9.517559468124436e-05, + "loss": 1.1002, + "step": 184000 + }, + { + "epoch": 0.18, + "learning_rate": 9.510448619279047e-05, + "loss": 1.1708, + "step": 185000 + }, + { + "epoch": 0.19, + "learning_rate": 9.503295629676674e-05, + "loss": 1.1752, + "step": 186000 + }, + { + "epoch": 0.19, + "learning_rate": 9.496086257254829e-05, + "loss": 1.3322, + "step": 187000 + }, + { + "epoch": 0.19, + "learning_rate": 9.48882771635401e-05, + "loss": 1.3658, + "step": 188000 + }, + { + "epoch": 0.19, + "learning_rate": 9.48152008635246e-05, + "loss": 1.3593, + "step": 189000 + }, + { + "epoch": 0.19, + "learning_rate": 9.474170828257791e-05, + "loss": 1.1516, + "step": 190000 + }, + { + "epoch": 0.19, + "learning_rate": 9.466765309224318e-05, + "loss": 1.1632, + "step": 191000 + }, + { + "epoch": 0.19, + "learning_rate": 9.459310942361059e-05, + "loss": 1.1145, + "step": 192000 + }, + { + "epoch": 0.19, + "learning_rate": 9.45181533665246e-05, + "loss": 1.074, + "step": 193000 + }, + { + "epoch": 0.19, + "learning_rate": 9.444263567865316e-05, + "loss": 1.2866, + "step": 194000 + }, + { + "epoch": 0.2, + "learning_rate": 9.436670821943329e-05, + "loss": 1.3152, + "step": 195000 + }, + { + "epoch": 0.2, + "learning_rate": 9.429029654288884e-05, + "loss": 1.3184, + "step": 196000 + }, + { + "epoch": 0.2, + "learning_rate": 9.421332426904754e-05, + "loss": 1.3529, + "step": 197000 + }, + { + "epoch": 0.2, + "learning_rate": 9.413586848537565e-05, + "loss": 1.3308, + "step": 198000 + }, + { + "epoch": 0.2, + "learning_rate": 9.405793003891719e-05, + "loss": 1.1764, + "step": 199000 + }, + { + "epoch": 0.2, + "learning_rate": 9.39795097819945e-05, + "loss": 1.1239, + "step": 200000 + }, + { + "epoch": 0.2, + "eval_accuracy": 0.7971212829130808, + "eval_loss": 0.9457727670669556, + "eval_runtime": 11.3916, + "eval_samples_per_second": 438.92, + "eval_steps_per_second": 0.878, + "step": 200000 + }, + { + "epoch": 0.2, + "learning_rate": 9.3900608572199e-05, + "loss": 1.114, + "step": 201000 + }, + { + "epoch": 0.2, + "learning_rate": 9.382122727238165e-05, + "loss": 1.0155, + "step": 202000 + }, + { + "epoch": 0.2, + "learning_rate": 9.374136675064364e-05, + "loss": 1.0197, + "step": 203000 + }, + { + "epoch": 0.2, + "learning_rate": 9.366110845784027e-05, + "loss": 1.2245, + "step": 204000 + }, + { + "epoch": 0.2, + "learning_rate": 9.358029259454708e-05, + "loss": 1.1102, + "step": 205000 + }, + { + "epoch": 0.21, + "learning_rate": 9.349900014415627e-05, + "loss": 1.115, + "step": 206000 + }, + { + "epoch": 0.21, + "learning_rate": 9.341731400113154e-05, + "loss": 1.1427, + "step": 207000 + }, + { + "epoch": 0.21, + "learning_rate": 9.333507152310676e-05, + "loss": 1.2215, + "step": 208000 + }, + { + "epoch": 0.21, + "learning_rate": 9.325235513968268e-05, + "loss": 1.0317, + "step": 209000 + }, + { + "epoch": 0.21, + "learning_rate": 9.316924918077827e-05, + "loss": 1.0061, + "step": 210000 + }, + { + "epoch": 0.21, + "learning_rate": 9.30855881770826e-05, + "loss": 0.9843, + "step": 211000 + }, + { + "epoch": 0.21, + "learning_rate": 9.300145599629752e-05, + "loss": 0.9863, + "step": 212000 + }, + { + "epoch": 0.21, + "learning_rate": 9.291685355847915e-05, + "loss": 1.0236, + "step": 213000 + }, + { + "epoch": 0.21, + "learning_rate": 9.283186709471781e-05, + "loss": 1.1892, + "step": 214000 + }, + { + "epoch": 0.21, + "learning_rate": 9.274641316485581e-05, + "loss": 1.17, + "step": 215000 + }, + { + "epoch": 0.22, + "learning_rate": 9.266049270153557e-05, + "loss": 1.1997, + "step": 216000 + }, + { + "epoch": 0.22, + "learning_rate": 9.257401993701012e-05, + "loss": 1.1973, + "step": 217000 + }, + { + "epoch": 0.22, + "learning_rate": 9.248708158982357e-05, + "loss": 1.1667, + "step": 218000 + }, + { + "epoch": 0.22, + "learning_rate": 9.23996786107198e-05, + "loss": 1.0233, + "step": 219000 + }, + { + "epoch": 0.22, + "learning_rate": 9.231190005346579e-05, + "loss": 1.118, + "step": 220000 + }, + { + "epoch": 0.22, + "learning_rate": 9.222374826427257e-05, + "loss": 1.1016, + "step": 221000 + }, + { + "epoch": 0.22, + "learning_rate": 9.21349585284373e-05, + "loss": 1.0464, + "step": 222000 + }, + { + "epoch": 0.22, + "learning_rate": 9.204570801144619e-05, + "loss": 1.2016, + "step": 223000 + }, + { + "epoch": 0.22, + "learning_rate": 9.195599768932858e-05, + "loss": 1.2954, + "step": 224000 + }, + { + "epoch": 0.23, + "learning_rate": 9.18660093386864e-05, + "loss": 1.2599, + "step": 225000 + }, + { + "epoch": 0.23, + "learning_rate": 9.177538326919512e-05, + "loss": 1.3015, + "step": 226000 + }, + { + "epoch": 0.23, + "learning_rate": 9.16843003508054e-05, + "loss": 1.3161, + "step": 227000 + }, + { + "epoch": 0.23, + "learning_rate": 9.159276157958547e-05, + "loss": 1.308, + "step": 228000 + }, + { + "epoch": 0.23, + "learning_rate": 9.150076795658864e-05, + "loss": 1.1081, + "step": 229000 + }, + { + "epoch": 0.23, + "learning_rate": 9.140832048784247e-05, + "loss": 1.1327, + "step": 230000 + }, + { + "epoch": 0.23, + "learning_rate": 9.131542018433763e-05, + "loss": 1.0167, + "step": 231000 + }, + { + "epoch": 0.23, + "learning_rate": 9.122206806201698e-05, + "loss": 0.9574, + "step": 232000 + }, + { + "epoch": 0.23, + "learning_rate": 9.112835916951715e-05, + "loss": 1.2086, + "step": 233000 + }, + { + "epoch": 0.23, + "learning_rate": 9.103410692640435e-05, + "loss": 1.231, + "step": 234000 + }, + { + "epoch": 0.23, + "learning_rate": 9.093940594087245e-05, + "loss": 1.181, + "step": 235000 + }, + { + "epoch": 0.24, + "learning_rate": 9.084435262053219e-05, + "loss": 1.2146, + "step": 236000 + }, + { + "epoch": 0.24, + "learning_rate": 9.074875770810744e-05, + "loss": 1.2343, + "step": 237000 + }, + { + "epoch": 0.24, + "learning_rate": 9.065271717379691e-05, + "loss": 1.0341, + "step": 238000 + }, + { + "epoch": 0.24, + "learning_rate": 9.055642548107829e-05, + "loss": 0.9415, + "step": 239000 + }, + { + "epoch": 0.24, + "learning_rate": 9.045949774468634e-05, + "loss": 0.9071, + "step": 240000 + }, + { + "epoch": 0.24, + "learning_rate": 9.036212754970922e-05, + "loss": 0.9272, + "step": 241000 + }, + { + "epoch": 0.24, + "learning_rate": 9.026431596097182e-05, + "loss": 0.932, + "step": 242000 + }, + { + "epoch": 0.24, + "learning_rate": 9.01661625196235e-05, + "loss": 1.0876, + "step": 243000 + }, + { + "epoch": 0.24, + "learning_rate": 9.006747179584774e-05, + "loss": 1.1039, + "step": 244000 + }, + { + "epoch": 0.24, + "learning_rate": 8.99683429006198e-05, + "loss": 1.1908, + "step": 245000 + }, + { + "epoch": 0.25, + "learning_rate": 8.986887670194317e-05, + "loss": 1.1791, + "step": 246000 + }, + { + "epoch": 0.25, + "learning_rate": 8.976887515621704e-05, + "loss": 1.1991, + "step": 247000 + }, + { + "epoch": 0.25, + "learning_rate": 8.966853935776588e-05, + "loss": 1.0425, + "step": 248000 + }, + { + "epoch": 0.25, + "learning_rate": 8.956777061770676e-05, + "loss": 1.0149, + "step": 249000 + }, + { + "epoch": 0.25, + "learning_rate": 8.946646851795914e-05, + "loss": 0.9472, + "step": 250000 + }, + { + "epoch": 0.25, + "eval_accuracy": 0.7875509927594045, + "eval_loss": 0.9979193806648254, + "eval_runtime": 10.7394, + "eval_samples_per_second": 465.574, + "eval_steps_per_second": 0.931, + "step": 250000 + }, + { + "epoch": 0.25, + "learning_rate": 8.936473481923489e-05, + "loss": 0.8647, + "step": 251000 + }, + { + "epoch": 0.25, + "learning_rate": 8.926257063407744e-05, + "loss": 1.0154, + "step": 252000 + }, + { + "epoch": 0.25, + "learning_rate": 8.916007988738915e-05, + "loss": 1.2556, + "step": 253000 + }, + { + "epoch": 0.25, + "learning_rate": 8.905705851349957e-05, + "loss": 1.2242, + "step": 254000 + }, + { + "epoch": 0.26, + "learning_rate": 8.895361001787558e-05, + "loss": 1.2757, + "step": 255000 + }, + { + "epoch": 0.26, + "learning_rate": 8.884973553181336e-05, + "loss": 1.2906, + "step": 256000 + }, + { + "epoch": 0.26, + "learning_rate": 8.874554070244375e-05, + "loss": 1.2885, + "step": 257000 + }, + { + "epoch": 0.26, + "learning_rate": 8.864092300505416e-05, + "loss": 1.0966, + "step": 258000 + }, + { + "epoch": 0.26, + "learning_rate": 8.853577822596901e-05, + "loss": 1.0932, + "step": 259000 + }, + { + "epoch": 0.26, + "learning_rate": 8.843021202578716e-05, + "loss": 1.0595, + "step": 260000 + }, + { + "epoch": 0.26, + "learning_rate": 8.83243317549683e-05, + "loss": 1.0109, + "step": 261000 + }, + { + "epoch": 0.26, + "learning_rate": 8.821792659908128e-05, + "loss": 1.1995, + "step": 262000 + }, + { + "epoch": 0.26, + "learning_rate": 8.811110349807205e-05, + "loss": 1.2683, + "step": 263000 + }, + { + "epoch": 0.26, + "learning_rate": 8.800397106780893e-05, + "loss": 1.1763, + "step": 264000 + }, + { + "epoch": 0.27, + "learning_rate": 8.789631600073084e-05, + "loss": 1.1731, + "step": 265000 + }, + { + "epoch": 0.27, + "learning_rate": 8.778835478172145e-05, + "loss": 1.2495, + "step": 266000 + }, + { + "epoch": 0.27, + "learning_rate": 8.76798724530479e-05, + "loss": 1.2205, + "step": 267000 + }, + { + "epoch": 0.27, + "learning_rate": 8.757097806332607e-05, + "loss": 0.9901, + "step": 268000 + }, + { + "epoch": 0.27, + "learning_rate": 8.746167280340752e-05, + "loss": 1.0082, + "step": 269000 + }, + { + "epoch": 0.27, + "learning_rate": 8.735217770656539e-05, + "loss": 0.9372, + "step": 270000 + }, + { + "epoch": 0.27, + "learning_rate": 8.724205511251769e-05, + "loss": 0.8586, + "step": 271000 + }, + { + "epoch": 0.27, + "learning_rate": 8.713152524532197e-05, + "loss": 1.0177, + "step": 272000 + }, + { + "epoch": 0.27, + "learning_rate": 8.702058931371517e-05, + "loss": 1.0323, + "step": 273000 + }, + { + "epoch": 0.27, + "learning_rate": 8.6909360073476e-05, + "loss": 1.11, + "step": 274000 + }, + { + "epoch": 0.28, + "learning_rate": 8.67976160600311e-05, + "loss": 1.1162, + "step": 275000 + }, + { + "epoch": 0.28, + "learning_rate": 8.668558198077614e-05, + "loss": 1.1175, + "step": 276000 + }, + { + "epoch": 0.28, + "learning_rate": 8.65730347686515e-05, + "loss": 1.0497, + "step": 277000 + }, + { + "epoch": 0.28, + "learning_rate": 8.646008759967813e-05, + "loss": 1.0092, + "step": 278000 + }, + { + "epoch": 0.28, + "learning_rate": 8.634674170902821e-05, + "loss": 0.9418, + "step": 279000 + }, + { + "epoch": 0.28, + "learning_rate": 8.623311227773569e-05, + "loss": 0.9255, + "step": 280000 + }, + { + "epoch": 0.28, + "learning_rate": 8.611897306229244e-05, + "loss": 0.9643, + "step": 281000 + }, + { + "epoch": 0.28, + "learning_rate": 8.60045535866349e-05, + "loss": 1.1115, + "step": 282000 + }, + { + "epoch": 0.28, + "learning_rate": 8.58896260342258e-05, + "loss": 1.0597, + "step": 283000 + }, + { + "epoch": 0.28, + "learning_rate": 8.577442151427801e-05, + "loss": 1.0903, + "step": 284000 + }, + { + "epoch": 0.28, + "learning_rate": 8.565871064718052e-05, + "loss": 1.1758, + "step": 285000 + }, + { + "epoch": 0.29, + "learning_rate": 8.554272611732321e-05, + "loss": 1.2468, + "step": 286000 + }, + { + "epoch": 0.29, + "learning_rate": 8.542623699204519e-05, + "loss": 1.0986, + "step": 287000 + }, + { + "epoch": 0.29, + "learning_rate": 8.53093604511152e-05, + "loss": 1.0916, + "step": 288000 + }, + { + "epoch": 0.29, + "learning_rate": 8.519209777267638e-05, + "loss": 1.0491, + "step": 289000 + }, + { + "epoch": 0.29, + "learning_rate": 8.507456807843545e-05, + "loss": 1.0027, + "step": 290000 + }, + { + "epoch": 0.29, + "learning_rate": 8.495653735920981e-05, + "loss": 1.106, + "step": 291000 + }, + { + "epoch": 0.29, + "learning_rate": 8.483824296440821e-05, + "loss": 1.2315, + "step": 292000 + }, + { + "epoch": 0.29, + "learning_rate": 8.471944936228e-05, + "loss": 1.1994, + "step": 293000 + }, + { + "epoch": 0.29, + "learning_rate": 8.460051479845447e-05, + "loss": 1.2447, + "step": 294000 + }, + { + "epoch": 0.29, + "learning_rate": 8.448096388237765e-05, + "loss": 1.2671, + "step": 295000 + }, + { + "epoch": 0.3, + "learning_rate": 8.436103588800417e-05, + "loss": 1.2646, + "step": 296000 + }, + { + "epoch": 0.3, + "learning_rate": 8.424085261786644e-05, + "loss": 0.9903, + "step": 297000 + }, + { + "epoch": 0.3, + "learning_rate": 8.412017477934155e-05, + "loss": 0.9825, + "step": 298000 + }, + { + "epoch": 0.3, + "learning_rate": 8.399912380805182e-05, + "loss": 0.9979, + "step": 299000 + }, + { + "epoch": 0.3, + "learning_rate": 8.387770102779126e-05, + "loss": 0.961, + "step": 300000 + }, + { + "epoch": 0.3, + "eval_accuracy": 0.8074508577716577, + "eval_loss": 0.8797607421875, + "eval_runtime": 10.8875, + "eval_samples_per_second": 459.241, + "eval_steps_per_second": 0.918, + "step": 300000 + }, + { + "epoch": 0.3, + "learning_rate": 8.375602974429366e-05, + "loss": 1.073, + "step": 301000 + }, + { + "epoch": 0.3, + "learning_rate": 8.363399004819421e-05, + "loss": 1.144, + "step": 302000 + }, + { + "epoch": 0.3, + "learning_rate": 8.351146055866368e-05, + "loss": 1.0968, + "step": 303000 + }, + { + "epoch": 0.3, + "learning_rate": 8.338856459316948e-05, + "loss": 1.0641, + "step": 304000 + }, + { + "epoch": 0.3, + "learning_rate": 8.326530349568222e-05, + "loss": 1.064, + "step": 305000 + }, + { + "epoch": 0.31, + "learning_rate": 8.31416786141655e-05, + "loss": 1.0356, + "step": 306000 + }, + { + "epoch": 0.31, + "learning_rate": 8.301781546845868e-05, + "loss": 0.9338, + "step": 307000 + }, + { + "epoch": 0.31, + "learning_rate": 8.289346743906984e-05, + "loss": 0.9026, + "step": 308000 + }, + { + "epoch": 0.31, + "learning_rate": 8.276888457896356e-05, + "loss": 0.8446, + "step": 309000 + }, + { + "epoch": 0.31, + "learning_rate": 8.264381883565022e-05, + "loss": 0.9099, + "step": 310000 + }, + { + "epoch": 0.31, + "learning_rate": 8.251839610476409e-05, + "loss": 1.1068, + "step": 311000 + }, + { + "epoch": 0.31, + "learning_rate": 8.239261775790807e-05, + "loss": 1.0591, + "step": 312000 + }, + { + "epoch": 0.31, + "learning_rate": 8.226648517057399e-05, + "loss": 1.1183, + "step": 313000 + }, + { + "epoch": 0.31, + "learning_rate": 8.213999972212765e-05, + "loss": 1.146, + "step": 314000 + }, + { + "epoch": 0.32, + "learning_rate": 8.201328980782183e-05, + "loss": 1.1369, + "step": 315000 + }, + { + "epoch": 0.32, + "learning_rate": 8.188610314006513e-05, + "loss": 0.9698, + "step": 316000 + }, + { + "epoch": 0.32, + "learning_rate": 8.175856777099312e-05, + "loss": 0.9333, + "step": 317000 + }, + { + "epoch": 0.32, + "learning_rate": 8.163068509531211e-05, + "loss": 0.9574, + "step": 318000 + }, + { + "epoch": 0.32, + "learning_rate": 8.150271331297903e-05, + "loss": 0.9808, + "step": 319000 + }, + { + "epoch": 0.32, + "learning_rate": 8.137414091098588e-05, + "loss": 1.0482, + "step": 320000 + }, + { + "epoch": 0.32, + "learning_rate": 8.124535449282972e-05, + "loss": 1.2092, + "step": 321000 + }, + { + "epoch": 0.32, + "learning_rate": 8.111622706352883e-05, + "loss": 1.1759, + "step": 322000 + }, + { + "epoch": 0.32, + "learning_rate": 8.098663026608033e-05, + "loss": 1.2193, + "step": 323000 + }, + { + "epoch": 0.32, + "learning_rate": 8.085669460380775e-05, + "loss": 1.2402, + "step": 324000 + }, + { + "epoch": 0.33, + "learning_rate": 8.072642149766666e-05, + "loss": 1.2365, + "step": 325000 + }, + { + "epoch": 0.33, + "learning_rate": 8.059581237230287e-05, + "loss": 1.0715, + "step": 326000 + }, + { + "epoch": 0.33, + "learning_rate": 8.046486865603688e-05, + "loss": 1.0666, + "step": 327000 + }, + { + "epoch": 0.33, + "learning_rate": 8.033359178084814e-05, + "loss": 1.0308, + "step": 328000 + }, + { + "epoch": 0.33, + "learning_rate": 8.020211495617513e-05, + "loss": 0.9836, + "step": 329000 + }, + { + "epoch": 0.33, + "learning_rate": 8.007017640320121e-05, + "loss": 1.0282, + "step": 330000 + }, + { + "epoch": 0.33, + "learning_rate": 7.993804143876718e-05, + "loss": 1.1226, + "step": 331000 + }, + { + "epoch": 0.33, + "learning_rate": 7.980544697365782e-05, + "loss": 1.1466, + "step": 332000 + }, + { + "epoch": 0.33, + "learning_rate": 7.967265964369563e-05, + "loss": 1.1792, + "step": 333000 + }, + { + "epoch": 0.33, + "learning_rate": 7.953954846694738e-05, + "loss": 1.134, + "step": 334000 + }, + { + "epoch": 0.34, + "learning_rate": 7.940598116801423e-05, + "loss": 1.159, + "step": 335000 + }, + { + "epoch": 0.34, + "learning_rate": 7.927209228998228e-05, + "loss": 0.9188, + "step": 336000 + }, + { + "epoch": 0.34, + "learning_rate": 7.913801766544098e-05, + "loss": 0.8693, + "step": 337000 + }, + { + "epoch": 0.34, + "learning_rate": 7.900349034318708e-05, + "loss": 0.8056, + "step": 338000 + }, + { + "epoch": 0.34, + "learning_rate": 7.886864584340953e-05, + "loss": 0.808, + "step": 339000 + }, + { + "epoch": 0.34, + "learning_rate": 7.873348564074622e-05, + "loss": 1.0187, + "step": 340000 + }, + { + "epoch": 0.34, + "learning_rate": 7.859828247475456e-05, + "loss": 1.0268, + "step": 341000 + }, + { + "epoch": 0.34, + "learning_rate": 7.846249592803267e-05, + "loss": 1.0245, + "step": 342000 + }, + { + "epoch": 0.34, + "learning_rate": 7.832653437280335e-05, + "loss": 1.1168, + "step": 343000 + }, + { + "epoch": 0.34, + "learning_rate": 7.819012710086294e-05, + "loss": 1.1404, + "step": 344000 + }, + { + "epoch": 0.34, + "learning_rate": 7.8053411546209e-05, + "loss": 1.0336, + "step": 345000 + }, + { + "epoch": 0.35, + "learning_rate": 7.791638920394093e-05, + "loss": 0.9649, + "step": 346000 + }, + { + "epoch": 0.35, + "learning_rate": 7.777906157251316e-05, + "loss": 0.9322, + "step": 347000 + }, + { + "epoch": 0.35, + "learning_rate": 7.76414301537186e-05, + "loss": 0.8737, + "step": 348000 + }, + { + "epoch": 0.35, + "learning_rate": 7.750363453686169e-05, + "loss": 0.855, + "step": 349000 + }, + { + "epoch": 0.35, + "learning_rate": 7.736567712952333e-05, + "loss": 1.0179, + "step": 350000 + }, + { + "epoch": 0.35, + "eval_accuracy": 0.8018313508243486, + "eval_loss": 0.9101699590682983, + "eval_runtime": 11.7713, + "eval_samples_per_second": 424.764, + "eval_steps_per_second": 0.85, + "step": 350000 + }, + { + "epoch": 0.35, + "learning_rate": 7.722714428804807e-05, + "loss": 1.096, + "step": 351000 + }, + { + "epoch": 0.35, + "learning_rate": 7.708831369488505e-05, + "loss": 1.1769, + "step": 352000 + }, + { + "epoch": 0.35, + "learning_rate": 7.694918686826337e-05, + "loss": 1.218, + "step": 353000 + }, + { + "epoch": 0.35, + "learning_rate": 7.680976532965179e-05, + "loss": 1.2225, + "step": 354000 + }, + { + "epoch": 0.35, + "learning_rate": 7.66701904644065e-05, + "loss": 1.0765, + "step": 355000 + }, + { + "epoch": 0.36, + "learning_rate": 7.653018436999124e-05, + "loss": 1.0637, + "step": 356000 + }, + { + "epoch": 0.36, + "learning_rate": 7.638988814573021e-05, + "loss": 1.0175, + "step": 357000 + }, + { + "epoch": 0.36, + "learning_rate": 7.624930332588043e-05, + "loss": 0.9619, + "step": 358000 + }, + { + "epoch": 0.36, + "learning_rate": 7.61085724626061e-05, + "loss": 1.0432, + "step": 359000 + }, + { + "epoch": 0.36, + "learning_rate": 7.596755665091857e-05, + "loss": 1.1913, + "step": 360000 + }, + { + "epoch": 0.36, + "learning_rate": 7.582625742986147e-05, + "loss": 1.1628, + "step": 361000 + }, + { + "epoch": 0.36, + "learning_rate": 7.568453447806063e-05, + "loss": 1.2034, + "step": 362000 + }, + { + "epoch": 0.36, + "learning_rate": 7.554253064430575e-05, + "loss": 1.1454, + "step": 363000 + }, + { + "epoch": 0.36, + "learning_rate": 7.540024748152803e-05, + "loss": 1.1433, + "step": 364000 + }, + { + "epoch": 0.36, + "learning_rate": 7.525768654571332e-05, + "loss": 1.0084, + "step": 365000 + }, + { + "epoch": 0.37, + "learning_rate": 7.511484939588518e-05, + "loss": 0.9967, + "step": 366000 + }, + { + "epoch": 0.37, + "learning_rate": 7.497188084255762e-05, + "loss": 0.9075, + "step": 367000 + }, + { + "epoch": 0.37, + "learning_rate": 7.482849622614294e-05, + "loss": 0.8931, + "step": 368000 + }, + { + "epoch": 0.37, + "learning_rate": 7.468512767147887e-05, + "loss": 0.9603, + "step": 369000 + }, + { + "epoch": 0.37, + "learning_rate": 7.454120212348005e-05, + "loss": 0.9906, + "step": 370000 + }, + { + "epoch": 0.37, + "learning_rate": 7.43970081968267e-05, + "loss": 0.9656, + "step": 371000 + }, + { + "epoch": 0.37, + "learning_rate": 7.425254746840043e-05, + "loss": 1.0444, + "step": 372000 + }, + { + "epoch": 0.37, + "learning_rate": 7.410796637590287e-05, + "loss": 1.0772, + "step": 373000 + }, + { + "epoch": 0.37, + "learning_rate": 7.396297704907714e-05, + "loss": 1.0469, + "step": 374000 + }, + { + "epoch": 0.38, + "learning_rate": 7.381772566697359e-05, + "loss": 0.8572, + "step": 375000 + }, + { + "epoch": 0.38, + "learning_rate": 7.36722138180381e-05, + "loss": 0.936, + "step": 376000 + }, + { + "epoch": 0.38, + "learning_rate": 7.352658899306739e-05, + "loss": 0.8848, + "step": 377000 + }, + { + "epoch": 0.38, + "learning_rate": 7.338070739939699e-05, + "loss": 0.8367, + "step": 378000 + }, + { + "epoch": 0.38, + "learning_rate": 7.323442421881121e-05, + "loss": 1.056, + "step": 379000 + }, + { + "epoch": 0.38, + "learning_rate": 7.308788695028466e-05, + "loss": 1.0573, + "step": 380000 + }, + { + "epoch": 0.38, + "learning_rate": 7.294109719632544e-05, + "loss": 1.0499, + "step": 381000 + }, + { + "epoch": 0.38, + "learning_rate": 7.279420372761656e-05, + "loss": 1.0515, + "step": 382000 + }, + { + "epoch": 0.38, + "learning_rate": 7.26469140698113e-05, + "loss": 1.0599, + "step": 383000 + }, + { + "epoch": 0.38, + "learning_rate": 7.249952440947382e-05, + "loss": 1.0554, + "step": 384000 + }, + { + "epoch": 0.39, + "learning_rate": 7.235174128430801e-05, + "loss": 1.04, + "step": 385000 + }, + { + "epoch": 0.39, + "learning_rate": 7.220371372408057e-05, + "loss": 1.0006, + "step": 386000 + }, + { + "epoch": 0.39, + "learning_rate": 7.20554433475972e-05, + "loss": 0.9656, + "step": 387000 + }, + { + "epoch": 0.39, + "learning_rate": 7.190708040782669e-05, + "loss": 0.9891, + "step": 388000 + }, + { + "epoch": 0.39, + "learning_rate": 7.175832950461096e-05, + "loss": 1.1548, + "step": 389000 + }, + { + "epoch": 0.39, + "learning_rate": 7.16094897629503e-05, + "loss": 1.1437, + "step": 390000 + }, + { + "epoch": 0.39, + "learning_rate": 7.146041417577084e-05, + "loss": 1.1659, + "step": 391000 + }, + { + "epoch": 0.39, + "learning_rate": 7.131095479401597e-05, + "loss": 1.2044, + "step": 392000 + }, + { + "epoch": 0.39, + "learning_rate": 7.116126235907376e-05, + "loss": 1.2036, + "step": 393000 + }, + { + "epoch": 0.39, + "learning_rate": 7.101133850795673e-05, + "loss": 1.0658, + "step": 394000 + }, + { + "epoch": 0.4, + "learning_rate": 7.086118488020812e-05, + "loss": 1.0297, + "step": 395000 + }, + { + "epoch": 0.4, + "learning_rate": 7.071095361305229e-05, + "loss": 0.9179, + "step": 396000 + }, + { + "epoch": 0.4, + "learning_rate": 7.056034558637109e-05, + "loss": 0.8803, + "step": 397000 + }, + { + "epoch": 0.4, + "learning_rate": 7.040966365967704e-05, + "loss": 0.9978, + "step": 398000 + }, + { + "epoch": 0.4, + "learning_rate": 7.025860781555982e-05, + "loss": 1.1169, + "step": 399000 + }, + { + "epoch": 0.4, + "learning_rate": 7.010748181405352e-05, + "loss": 1.037, + "step": 400000 + }, + { + "epoch": 0.4, + "eval_accuracy": 0.8194723236238552, + "eval_loss": 0.810663104057312, + "eval_runtime": 10.4613, + "eval_samples_per_second": 477.953, + "eval_steps_per_second": 0.956, + "step": 400000 + }, + { + "epoch": 0.4, + "learning_rate": 6.995598475356628e-05, + "loss": 1.1173, + "step": 401000 + }, + { + "epoch": 0.4, + "learning_rate": 6.980442128137954e-05, + "loss": 1.0554, + "step": 402000 + }, + { + "epoch": 0.4, + "learning_rate": 6.965248962486922e-05, + "loss": 1.0148, + "step": 403000 + }, + { + "epoch": 0.4, + "learning_rate": 6.950049530525111e-05, + "loss": 0.8313, + "step": 404000 + }, + { + "epoch": 0.41, + "learning_rate": 6.934813569205621e-05, + "loss": 0.8747, + "step": 405000 + }, + { + "epoch": 0.41, + "learning_rate": 6.919571716708366e-05, + "loss": 0.8472, + "step": 406000 + }, + { + "epoch": 0.41, + "learning_rate": 6.904293625524425e-05, + "loss": 0.7775, + "step": 407000 + }, + { + "epoch": 0.41, + "learning_rate": 6.88899470929049e-05, + "loss": 0.9395, + "step": 408000 + }, + { + "epoch": 0.41, + "learning_rate": 6.873675135313058e-05, + "loss": 1.0758, + "step": 409000 + }, + { + "epoch": 0.41, + "learning_rate": 6.858350421367768e-05, + "loss": 1.041, + "step": 410000 + }, + { + "epoch": 0.41, + "learning_rate": 6.842990054963225e-05, + "loss": 1.0575, + "step": 411000 + }, + { + "epoch": 0.41, + "learning_rate": 6.827624924447044e-05, + "loss": 1.1056, + "step": 412000 + }, + { + "epoch": 0.41, + "learning_rate": 6.812224436855836e-05, + "loss": 1.075, + "step": 413000 + }, + { + "epoch": 0.41, + "learning_rate": 6.796819561217397e-05, + "loss": 0.8725, + "step": 414000 + }, + { + "epoch": 0.41, + "learning_rate": 6.781379625433246e-05, + "loss": 0.8658, + "step": 415000 + }, + { + "epoch": 0.42, + "learning_rate": 6.765920208766913e-05, + "loss": 0.8149, + "step": 416000 + }, + { + "epoch": 0.42, + "learning_rate": 6.75045696859852e-05, + "loss": 0.9049, + "step": 417000 + }, + { + "epoch": 0.42, + "learning_rate": 6.734959116622053e-05, + "loss": 1.1137, + "step": 418000 + }, + { + "epoch": 0.42, + "learning_rate": 6.719457817656532e-05, + "loss": 1.1364, + "step": 419000 + }, + { + "epoch": 0.42, + "learning_rate": 6.703922207618428e-05, + "loss": 1.146, + "step": 420000 + }, + { + "epoch": 0.42, + "learning_rate": 6.688383527254907e-05, + "loss": 1.1727, + "step": 421000 + }, + { + "epoch": 0.42, + "learning_rate": 6.672810838053671e-05, + "loss": 1.19, + "step": 422000 + }, + { + "epoch": 0.42, + "learning_rate": 6.657251055370414e-05, + "loss": 1.0855, + "step": 423000 + }, + { + "epoch": 0.42, + "learning_rate": 6.641641985562924e-05, + "loss": 1.019, + "step": 424000 + }, + { + "epoch": 0.42, + "learning_rate": 6.626014963021694e-05, + "loss": 0.9942, + "step": 425000 + }, + { + "epoch": 0.43, + "learning_rate": 6.61037015864134e-05, + "loss": 0.9554, + "step": 426000 + }, + { + "epoch": 0.43, + "learning_rate": 6.594707743510933e-05, + "loss": 0.9881, + "step": 427000 + }, + { + "epoch": 0.43, + "learning_rate": 6.57902788891214e-05, + "loss": 1.1244, + "step": 428000 + }, + { + "epoch": 0.43, + "learning_rate": 6.563346472008175e-05, + "loss": 1.0419, + "step": 429000 + }, + { + "epoch": 0.43, + "learning_rate": 6.547632270089105e-05, + "loss": 1.0964, + "step": 430000 + }, + { + "epoch": 0.43, + "learning_rate": 6.531916883034684e-05, + "loss": 1.1526, + "step": 431000 + }, + { + "epoch": 0.43, + "learning_rate": 6.516169020498223e-05, + "loss": 1.1175, + "step": 432000 + }, + { + "epoch": 0.43, + "learning_rate": 6.500420350048018e-05, + "loss": 0.945, + "step": 433000 + }, + { + "epoch": 0.43, + "learning_rate": 6.48465530404153e-05, + "loss": 0.9563, + "step": 434000 + }, + { + "epoch": 0.43, + "learning_rate": 6.468858249431153e-05, + "loss": 0.8274, + "step": 435000 + }, + { + "epoch": 0.44, + "learning_rate": 6.453045131622291e-05, + "loss": 0.7477, + "step": 436000 + }, + { + "epoch": 0.44, + "learning_rate": 6.437216123544669e-05, + "loss": 0.835, + "step": 437000 + }, + { + "epoch": 0.44, + "learning_rate": 6.421403103322858e-05, + "loss": 1.0134, + "step": 438000 + }, + { + "epoch": 0.44, + "learning_rate": 6.40554286510482e-05, + "loss": 0.973, + "step": 439000 + }, + { + "epoch": 0.44, + "learning_rate": 6.38968313932415e-05, + "loss": 1.0058, + "step": 440000 + }, + { + "epoch": 0.44, + "learning_rate": 6.373792348246463e-05, + "loss": 1.0381, + "step": 441000 + }, + { + "epoch": 0.44, + "learning_rate": 6.357886533595756e-05, + "loss": 1.0955, + "step": 442000 + }, + { + "epoch": 0.44, + "learning_rate": 6.341965869315472e-05, + "loss": 0.8979, + "step": 443000 + }, + { + "epoch": 0.44, + "learning_rate": 6.326030529511443e-05, + "loss": 0.8869, + "step": 444000 + }, + { + "epoch": 0.45, + "learning_rate": 6.310096645476388e-05, + "loss": 0.8761, + "step": 445000 + }, + { + "epoch": 0.45, + "learning_rate": 6.294132491822075e-05, + "loss": 0.8092, + "step": 446000 + }, + { + "epoch": 0.45, + "learning_rate": 6.278170171059243e-05, + "loss": 0.9651, + "step": 447000 + }, + { + "epoch": 0.45, + "learning_rate": 6.2621779011803e-05, + "loss": 0.9916, + "step": 448000 + }, + { + "epoch": 0.45, + "learning_rate": 6.24618784123524e-05, + "loss": 0.9917, + "step": 449000 + }, + { + "epoch": 0.45, + "learning_rate": 6.230168153985983e-05, + "loss": 1.1206, + "step": 450000 + }, + { + "epoch": 0.45, + "eval_accuracy": 0.8152173260941297, + "eval_loss": 0.8323125839233398, + "eval_runtime": 10.9195, + "eval_samples_per_second": 457.898, + "eval_steps_per_second": 0.916, + "step": 450000 + }, + { + "epoch": 0.45, + "learning_rate": 6.214167093404725e-05, + "loss": 1.1638, + "step": 451000 + }, + { + "epoch": 0.45, + "learning_rate": 6.198120702027724e-05, + "loss": 1.1351, + "step": 452000 + }, + { + "epoch": 0.45, + "learning_rate": 6.182061208194684e-05, + "loss": 0.9823, + "step": 453000 + }, + { + "epoch": 0.45, + "learning_rate": 6.165988787529652e-05, + "loss": 0.99, + "step": 454000 + }, + { + "epoch": 0.46, + "learning_rate": 6.149903615798047e-05, + "loss": 0.9387, + "step": 455000 + }, + { + "epoch": 0.46, + "learning_rate": 6.133821972874338e-05, + "loss": 0.9391, + "step": 456000 + }, + { + "epoch": 0.46, + "learning_rate": 6.117711839172833e-05, + "loss": 1.1115, + "step": 457000 + }, + { + "epoch": 0.46, + "learning_rate": 6.1016217391488694e-05, + "loss": 1.1276, + "step": 458000 + }, + { + "epoch": 0.46, + "learning_rate": 6.085487359441242e-05, + "loss": 1.1379, + "step": 459000 + }, + { + "epoch": 0.46, + "learning_rate": 6.069341109017761e-05, + "loss": 1.172, + "step": 460000 + }, + { + "epoch": 0.46, + "learning_rate": 6.053183164451236e-05, + "loss": 1.1493, + "step": 461000 + }, + { + "epoch": 0.46, + "learning_rate": 6.037013702442358e-05, + "loss": 0.9853, + "step": 462000 + }, + { + "epoch": 0.46, + "learning_rate": 6.020849086226153e-05, + "loss": 0.9542, + "step": 463000 + }, + { + "epoch": 0.46, + "learning_rate": 6.004657131011767e-05, + "loss": 0.9451, + "step": 464000 + }, + { + "epoch": 0.47, + "learning_rate": 5.988454189027957e-05, + "loss": 0.8593, + "step": 465000 + }, + { + "epoch": 0.47, + "learning_rate": 5.972256656559438e-05, + "loss": 0.8856, + "step": 466000 + }, + { + "epoch": 0.47, + "learning_rate": 5.956032283276984e-05, + "loss": 1.0615, + "step": 467000 + }, + { + "epoch": 0.47, + "learning_rate": 5.939813694970145e-05, + "loss": 0.9391, + "step": 468000 + }, + { + "epoch": 0.47, + "learning_rate": 5.923584849556318e-05, + "loss": 0.961, + "step": 469000 + }, + { + "epoch": 0.47, + "learning_rate": 5.9073296639853294e-05, + "loss": 0.9814, + "step": 470000 + }, + { + "epoch": 0.47, + "learning_rate": 5.8910645560025016e-05, + "loss": 1.0385, + "step": 471000 + }, + { + "epoch": 0.47, + "learning_rate": 5.874789703480451e-05, + "loss": 0.8768, + "step": 472000 + }, + { + "epoch": 0.47, + "learning_rate": 5.8585052843983566e-05, + "loss": 0.8435, + "step": 473000 + }, + { + "epoch": 0.47, + "learning_rate": 5.842211476840016e-05, + "loss": 0.8323, + "step": 474000 + }, + { + "epoch": 0.47, + "learning_rate": 5.825924766550955e-05, + "loss": 0.8381, + "step": 475000 + }, + { + "epoch": 0.48, + "learning_rate": 5.8096290421362876e-05, + "loss": 0.9095, + "step": 476000 + }, + { + "epoch": 0.48, + "learning_rate": 5.793308156198103e-05, + "loss": 1.0265, + "step": 477000 + }, + { + "epoch": 0.48, + "learning_rate": 5.776978594769021e-05, + "loss": 1.0133, + "step": 478000 + }, + { + "epoch": 0.48, + "learning_rate": 5.7606405364265115e-05, + "loss": 1.0417, + "step": 479000 + }, + { + "epoch": 0.48, + "learning_rate": 5.744310510313018e-05, + "loss": 1.0367, + "step": 480000 + }, + { + "epoch": 0.48, + "learning_rate": 5.727956002295942e-05, + "loss": 1.0063, + "step": 481000 + }, + { + "epoch": 0.48, + "learning_rate": 5.711593533468653e-05, + "loss": 0.8627, + "step": 482000 + }, + { + "epoch": 0.48, + "learning_rate": 5.6952232827684906e-05, + "loss": 0.9525, + "step": 483000 + }, + { + "epoch": 0.48, + "learning_rate": 5.678861810809461e-05, + "loss": 0.937, + "step": 484000 + }, + { + "epoch": 0.48, + "learning_rate": 5.662476540848262e-05, + "loss": 0.9029, + "step": 485000 + }, + { + "epoch": 0.49, + "learning_rate": 5.646116818290258e-05, + "loss": 1.0793, + "step": 486000 + }, + { + "epoch": 0.49, + "learning_rate": 5.629717252072332e-05, + "loss": 1.1195, + "step": 487000 + }, + { + "epoch": 0.49, + "learning_rate": 5.613327209200223e-05, + "loss": 1.1135, + "step": 488000 + }, + { + "epoch": 0.49, + "learning_rate": 5.596914056044104e-05, + "loss": 1.14, + "step": 489000 + }, + { + "epoch": 0.49, + "learning_rate": 5.580494375131501e-05, + "loss": 1.1599, + "step": 490000 + }, + { + "epoch": 0.49, + "learning_rate": 5.5640683460254154e-05, + "loss": 1.1177, + "step": 491000 + }, + { + "epoch": 0.49, + "learning_rate": 5.547636148358269e-05, + "loss": 0.9853, + "step": 492000 + }, + { + "epoch": 0.49, + "learning_rate": 5.531197961829942e-05, + "loss": 0.982, + "step": 493000 + }, + { + "epoch": 0.49, + "learning_rate": 5.514770413043232e-05, + "loss": 0.8807, + "step": 494000 + }, + { + "epoch": 0.49, + "learning_rate": 5.498320793691619e-05, + "loss": 0.8529, + "step": 495000 + }, + { + "epoch": 0.5, + "learning_rate": 5.481882182514718e-05, + "loss": 1.0643, + "step": 496000 + }, + { + "epoch": 0.5, + "learning_rate": 5.465421849180081e-05, + "loss": 1.0695, + "step": 497000 + }, + { + "epoch": 0.5, + "learning_rate": 5.4489564260666694e-05, + "loss": 1.0373, + "step": 498000 + }, + { + "epoch": 0.5, + "learning_rate": 5.432502565962998e-05, + "loss": 1.0638, + "step": 499000 + }, + { + "epoch": 0.5, + "learning_rate": 5.4160275081750255e-05, + "loss": 1.0865, + "step": 500000 + }, + { + "epoch": 0.5, + "eval_accuracy": 0.8242477038501768, + "eval_loss": 0.782859206199646, + "eval_runtime": 10.6001, + "eval_samples_per_second": 471.696, + "eval_steps_per_second": 0.943, + "step": 500000 + }, + { + "epoch": 0.5, + "learning_rate": 5.39956438259683e-05, + "loss": 0.8685, + "step": 501000 + }, + { + "epoch": 0.5, + "learning_rate": 5.3830804100862085e-05, + "loss": 0.8195, + "step": 502000 + }, + { + "epoch": 0.5, + "learning_rate": 5.3666252286543196e-05, + "loss": 0.7759, + "step": 503000 + }, + { + "epoch": 0.5, + "learning_rate": 5.350133065681624e-05, + "loss": 0.7921, + "step": 504000 + }, + { + "epoch": 0.51, + "learning_rate": 5.3336370737098185e-05, + "loss": 0.8275, + "step": 505000 + }, + { + "epoch": 0.51, + "learning_rate": 5.3171374331364276e-05, + "loss": 0.9585, + "step": 506000 + }, + { + "epoch": 0.51, + "learning_rate": 5.300650829179897e-05, + "loss": 0.9638, + "step": 507000 + }, + { + "epoch": 0.51, + "learning_rate": 5.284144435951072e-05, + "loss": 1.0516, + "step": 508000 + }, + { + "epoch": 0.51, + "learning_rate": 5.2676349353642174e-05, + "loss": 1.0423, + "step": 509000 + }, + { + "epoch": 0.51, + "learning_rate": 5.251139021793834e-05, + "loss": 1.0497, + "step": 510000 + }, + { + "epoch": 0.51, + "learning_rate": 5.234623850814717e-05, + "loss": 0.9067, + "step": 511000 + }, + { + "epoch": 0.51, + "learning_rate": 5.218122632985052e-05, + "loss": 0.8785, + "step": 512000 + }, + { + "epoch": 0.51, + "learning_rate": 5.201602513318512e-05, + "loss": 0.8185, + "step": 513000 + }, + { + "epoch": 0.51, + "learning_rate": 5.185080188959187e-05, + "loss": 0.752, + "step": 514000 + }, + { + "epoch": 0.52, + "learning_rate": 5.168572365891783e-05, + "loss": 0.9095, + "step": 515000 + }, + { + "epoch": 0.52, + "learning_rate": 5.152046175978524e-05, + "loss": 1.0988, + "step": 516000 + }, + { + "epoch": 0.52, + "learning_rate": 5.1355348519357e-05, + "loss": 1.0842, + "step": 517000 + }, + { + "epoch": 0.52, + "learning_rate": 5.11900551865487e-05, + "loss": 1.125, + "step": 518000 + }, + { + "epoch": 0.52, + "learning_rate": 5.102491415173354e-05, + "loss": 1.1397, + "step": 519000 + }, + { + "epoch": 0.52, + "learning_rate": 5.08595966084867e-05, + "loss": 1.1343, + "step": 520000 + }, + { + "epoch": 0.52, + "learning_rate": 5.069443499586649e-05, + "loss": 0.9646, + "step": 521000 + }, + { + "epoch": 0.52, + "learning_rate": 5.052910046647634e-05, + "loss": 0.9628, + "step": 522000 + }, + { + "epoch": 0.52, + "learning_rate": 5.036376015092827e-05, + "loss": 0.9286, + "step": 523000 + }, + { + "epoch": 0.52, + "learning_rate": 5.0198415857357464e-05, + "loss": 0.9084, + "step": 524000 + }, + { + "epoch": 0.53, + "learning_rate": 5.003323474088806e-05, + "loss": 1.0839, + "step": 525000 + }, + { + "epoch": 0.53, + "learning_rate": 4.986805326169539e-05, + "loss": 1.1188, + "step": 526000 + }, + { + "epoch": 0.53, + "learning_rate": 4.970270787850604e-05, + "loss": 1.0571, + "step": 527000 + }, + { + "epoch": 0.53, + "learning_rate": 4.9537365746455695e-05, + "loss": 1.0487, + "step": 528000 + }, + { + "epoch": 0.53, + "learning_rate": 4.937202867369945e-05, + "loss": 1.1107, + "step": 529000 + }, + { + "epoch": 0.53, + "learning_rate": 4.920686379451033e-05, + "loss": 1.0362, + "step": 530000 + }, + { + "epoch": 0.53, + "learning_rate": 4.904170757159314e-05, + "loss": 0.9012, + "step": 531000 + }, + { + "epoch": 0.53, + "learning_rate": 4.887639650222643e-05, + "loss": 0.8745, + "step": 532000 + }, + { + "epoch": 0.53, + "learning_rate": 4.8711097720407546e-05, + "loss": 0.8339, + "step": 533000 + }, + { + "epoch": 0.53, + "learning_rate": 4.854581303381751e-05, + "loss": 0.7654, + "step": 534000 + }, + { + "epoch": 0.54, + "learning_rate": 4.838070951022198e-05, + "loss": 0.9112, + "step": 535000 + }, + { + "epoch": 0.54, + "learning_rate": 4.821545841788341e-05, + "loss": 0.8959, + "step": 536000 + }, + { + "epoch": 0.54, + "learning_rate": 4.805039206223316e-05, + "loss": 0.986, + "step": 537000 + }, + { + "epoch": 0.54, + "learning_rate": 4.78851817855374e-05, + "loss": 0.9942, + "step": 538000 + }, + { + "epoch": 0.54, + "learning_rate": 4.772015981115084e-05, + "loss": 0.9933, + "step": 539000 + }, + { + "epoch": 0.54, + "learning_rate": 4.7554997569705646e-05, + "loss": 0.8891, + "step": 540000 + }, + { + "epoch": 0.54, + "learning_rate": 4.7389862066415265e-05, + "loss": 0.9105, + "step": 541000 + }, + { + "epoch": 0.54, + "learning_rate": 4.722475510717516e-05, + "loss": 0.8341, + "step": 542000 + }, + { + "epoch": 0.54, + "learning_rate": 4.7059843558417655e-05, + "loss": 0.8087, + "step": 543000 + }, + { + "epoch": 0.54, + "learning_rate": 4.689479907063946e-05, + "loss": 0.8851, + "step": 544000 + }, + { + "epoch": 0.55, + "learning_rate": 4.672995353380826e-05, + "loss": 0.9986, + "step": 545000 + }, + { + "epoch": 0.55, + "learning_rate": 4.656497872985476e-05, + "loss": 0.9483, + "step": 546000 + }, + { + "epoch": 0.55, + "learning_rate": 4.6400371326533076e-05, + "loss": 0.9711, + "step": 547000 + }, + { + "epoch": 0.55, + "learning_rate": 4.623547337545436e-05, + "loss": 1.0405, + "step": 548000 + }, + { + "epoch": 0.55, + "learning_rate": 4.607061659263568e-05, + "loss": 1.113, + "step": 549000 + }, + { + "epoch": 0.55, + "learning_rate": 4.5905802780924407e-05, + "loss": 0.9616, + "step": 550000 + }, + { + "epoch": 0.55, + "eval_accuracy": 0.822435579595823, + "eval_loss": 0.7894766926765442, + "eval_runtime": 10.8513, + "eval_samples_per_second": 460.775, + "eval_steps_per_second": 0.922, + "step": 550000 + }, + { + "epoch": 0.55, + "learning_rate": 4.574103374269796e-05, + "loss": 0.9594, + "step": 551000 + }, + { + "epoch": 0.55, + "learning_rate": 4.557647597844301e-05, + "loss": 0.9291, + "step": 552000 + }, + { + "epoch": 0.55, + "learning_rate": 4.541180184306395e-05, + "loss": 0.8842, + "step": 553000 + }, + { + "epoch": 0.55, + "learning_rate": 4.5247507080131874e-05, + "loss": 1.0254, + "step": 554000 + }, + { + "epoch": 0.56, + "learning_rate": 4.508293499090543e-05, + "loss": 1.1074, + "step": 555000 + }, + { + "epoch": 0.56, + "learning_rate": 4.4918416673914135e-05, + "loss": 1.0816, + "step": 556000 + }, + { + "epoch": 0.56, + "learning_rate": 4.4754118362693004e-05, + "loss": 1.1233, + "step": 557000 + }, + { + "epoch": 0.56, + "learning_rate": 4.458971292873412e-05, + "loss": 1.1443, + "step": 558000 + }, + { + "epoch": 0.56, + "learning_rate": 4.442536666080782e-05, + "loss": 1.1356, + "step": 559000 + }, + { + "epoch": 0.56, + "learning_rate": 4.426124561043402e-05, + "loss": 0.88, + "step": 560000 + }, + { + "epoch": 0.56, + "learning_rate": 4.409702300204254e-05, + "loss": 0.8733, + "step": 561000 + }, + { + "epoch": 0.56, + "learning_rate": 4.393286494766179e-05, + "loss": 0.8834, + "step": 562000 + }, + { + "epoch": 0.56, + "learning_rate": 4.376877324249791e-05, + "loss": 0.8683, + "step": 563000 + }, + { + "epoch": 0.56, + "learning_rate": 4.3604913669958144e-05, + "loss": 0.9784, + "step": 564000 + }, + { + "epoch": 0.56, + "learning_rate": 4.344095997509148e-05, + "loss": 1.0091, + "step": 565000 + }, + { + "epoch": 0.57, + "learning_rate": 4.32770780088357e-05, + "loss": 0.994, + "step": 566000 + }, + { + "epoch": 0.57, + "learning_rate": 4.3113433334503214e-05, + "loss": 0.9519, + "step": 567000 + }, + { + "epoch": 0.57, + "learning_rate": 4.29497001250192e-05, + "loss": 0.9524, + "step": 568000 + }, + { + "epoch": 0.57, + "learning_rate": 4.278620763348594e-05, + "loss": 0.8762, + "step": 569000 + }, + { + "epoch": 0.57, + "learning_rate": 4.2622630335834316e-05, + "loss": 0.8521, + "step": 570000 + }, + { + "epoch": 0.57, + "learning_rate": 4.245929717164161e-05, + "loss": 0.7913, + "step": 571000 + }, + { + "epoch": 0.57, + "learning_rate": 4.229588293406972e-05, + "loss": 0.7513, + "step": 572000 + }, + { + "epoch": 0.57, + "learning_rate": 4.2132552947487734e-05, + "loss": 0.8198, + "step": 573000 + }, + { + "epoch": 0.57, + "learning_rate": 4.196930899804627e-05, + "loss": 1.006, + "step": 574000 + }, + { + "epoch": 0.57, + "learning_rate": 4.1806479094376326e-05, + "loss": 0.9535, + "step": 575000 + }, + { + "epoch": 0.58, + "learning_rate": 4.1643412392891206e-05, + "loss": 1.0172, + "step": 576000 + }, + { + "epoch": 0.58, + "learning_rate": 4.1480600006783755e-05, + "loss": 1.0338, + "step": 577000 + }, + { + "epoch": 0.58, + "learning_rate": 4.131771776611856e-05, + "loss": 1.0166, + "step": 578000 + }, + { + "epoch": 0.58, + "learning_rate": 4.115493047350083e-05, + "loss": 0.8639, + "step": 579000 + }, + { + "epoch": 0.58, + "learning_rate": 4.099223990914634e-05, + "loss": 0.8269, + "step": 580000 + }, + { + "epoch": 0.58, + "learning_rate": 4.082981039447362e-05, + "loss": 0.8496, + "step": 581000 + }, + { + "epoch": 0.58, + "learning_rate": 4.0667318521868897e-05, + "loss": 0.8634, + "step": 582000 + }, + { + "epoch": 0.58, + "learning_rate": 4.0504928709973586e-05, + "loss": 0.9634, + "step": 583000 + }, + { + "epoch": 0.58, + "learning_rate": 4.034280496817482e-05, + "loss": 1.1009, + "step": 584000 + }, + { + "epoch": 0.58, + "learning_rate": 4.018062449767229e-05, + "loss": 1.0633, + "step": 585000 + }, + { + "epoch": 0.59, + "learning_rate": 4.001871342914655e-05, + "loss": 1.1073, + "step": 586000 + }, + { + "epoch": 0.59, + "learning_rate": 3.98567493872435e-05, + "loss": 1.1197, + "step": 587000 + }, + { + "epoch": 0.59, + "learning_rate": 3.9695219864658135e-05, + "loss": 1.121, + "step": 588000 + }, + { + "epoch": 0.59, + "learning_rate": 3.953364089929742e-05, + "loss": 0.9561, + "step": 589000 + }, + { + "epoch": 0.59, + "learning_rate": 3.937201459501688e-05, + "loss": 0.9589, + "step": 590000 + }, + { + "epoch": 0.59, + "learning_rate": 3.921050451668199e-05, + "loss": 0.9277, + "step": 591000 + }, + { + "epoch": 0.59, + "learning_rate": 3.9049112430541065e-05, + "loss": 0.8929, + "step": 592000 + }, + { + "epoch": 0.59, + "learning_rate": 3.888800131347529e-05, + "loss": 0.9633, + "step": 593000 + }, + { + "epoch": 0.59, + "learning_rate": 3.872685038288536e-05, + "loss": 1.0104, + "step": 594000 + }, + { + "epoch": 0.59, + "learning_rate": 3.856582273365353e-05, + "loss": 1.0472, + "step": 595000 + }, + { + "epoch": 0.6, + "learning_rate": 3.840492012675236e-05, + "loss": 1.0702, + "step": 596000 + }, + { + "epoch": 0.6, + "learning_rate": 3.8244305033669075e-05, + "loss": 1.0294, + "step": 597000 + }, + { + "epoch": 0.6, + "learning_rate": 3.808365765942003e-05, + "loss": 1.0254, + "step": 598000 + }, + { + "epoch": 0.6, + "learning_rate": 3.7923140600381655e-05, + "loss": 0.8297, + "step": 599000 + }, + { + "epoch": 0.6, + "learning_rate": 3.776275561194279e-05, + "loss": 0.7727, + "step": 600000 + }, + { + "epoch": 0.6, + "eval_accuracy": 0.8285096268267774, + "eval_loss": 0.7584890127182007, + "eval_runtime": 11.327, + "eval_samples_per_second": 441.425, + "eval_steps_per_second": 0.883, + "step": 600000 + }, + { + "epoch": 0.6, + "learning_rate": 3.760250444804797e-05, + "loss": 0.7138, + "step": 601000 + }, + { + "epoch": 0.6, + "learning_rate": 3.744254890846151e-05, + "loss": 0.7417, + "step": 602000 + }, + { + "epoch": 0.6, + "learning_rate": 3.7282730420633285e-05, + "loss": 0.929, + "step": 603000 + }, + { + "epoch": 0.6, + "learning_rate": 3.712289095940718e-05, + "loss": 0.9185, + "step": 604000 + }, + { + "epoch": 0.6, + "learning_rate": 3.696319232018241e-05, + "loss": 0.9308, + "step": 605000 + }, + { + "epoch": 0.61, + "learning_rate": 3.6803636249397785e-05, + "loss": 1.0135, + "step": 606000 + }, + { + "epoch": 0.61, + "learning_rate": 3.664438383102547e-05, + "loss": 1.0432, + "step": 607000 + }, + { + "epoch": 0.61, + "learning_rate": 3.648511798325486e-05, + "loss": 0.9097, + "step": 608000 + }, + { + "epoch": 0.61, + "learning_rate": 3.632599993206869e-05, + "loss": 0.8698, + "step": 609000 + }, + { + "epoch": 0.61, + "learning_rate": 3.616719031079871e-05, + "loss": 0.8344, + "step": 610000 + }, + { + "epoch": 1.0, + "learning_rate": 3.600837291927178e-05, + "loss": 0.8786, + "step": 611000 + }, + { + "epoch": 1.0, + "learning_rate": 3.584970853793694e-05, + "loss": 0.9996, + "step": 612000 + }, + { + "epoch": 1.0, + "learning_rate": 3.569135733368609e-05, + "loss": 1.0586, + "step": 613000 + }, + { + "epoch": 1.0, + "learning_rate": 3.553300401908391e-05, + "loss": 1.0897, + "step": 614000 + }, + { + "epoch": 1.0, + "learning_rate": 3.5374967028736584e-05, + "loss": 0.9494, + "step": 615000 + }, + { + "epoch": 1.01, + "learning_rate": 3.521693170082889e-05, + "loss": 0.856, + "step": 616000 + }, + { + "epoch": 1.01, + "learning_rate": 3.505905803818765e-05, + "loss": 0.8594, + "step": 617000 + }, + { + "epoch": 1.01, + "learning_rate": 3.490150539537649e-05, + "loss": 0.7276, + "step": 618000 + }, + { + "epoch": 1.01, + "learning_rate": 3.474396007494727e-05, + "loss": 0.7412, + "step": 619000 + }, + { + "epoch": 1.01, + "learning_rate": 3.45867388867009e-05, + "loss": 0.8891, + "step": 620000 + }, + { + "epoch": 1.01, + "learning_rate": 3.442952879313661e-05, + "loss": 0.9131, + "step": 621000 + }, + { + "epoch": 1.01, + "learning_rate": 3.427264592994348e-05, + "loss": 0.9492, + "step": 622000 + }, + { + "epoch": 1.01, + "learning_rate": 3.411577793324775e-05, + "loss": 0.9621, + "step": 623000 + }, + { + "epoch": 1.01, + "learning_rate": 3.395924025081174e-05, + "loss": 0.975, + "step": 624000 + }, + { + "epoch": 1.01, + "learning_rate": 3.3802721206038737e-05, + "loss": 0.8927, + "step": 625000 + }, + { + "epoch": 1.02, + "learning_rate": 3.36463792921116e-05, + "loss": 0.8459, + "step": 626000 + }, + { + "epoch": 1.02, + "learning_rate": 3.3490372291934334e-05, + "loss": 0.8157, + "step": 627000 + }, + { + "epoch": 1.02, + "learning_rate": 3.333438958553279e-05, + "loss": 0.7887, + "step": 628000 + }, + { + "epoch": 1.02, + "learning_rate": 3.3178744840428254e-05, + "loss": 0.8727, + "step": 629000 + }, + { + "epoch": 1.02, + "learning_rate": 3.302312815791971e-05, + "loss": 0.9689, + "step": 630000 + }, + { + "epoch": 1.02, + "learning_rate": 3.2867697131761696e-05, + "loss": 0.9148, + "step": 631000 + }, + { + "epoch": 1.02, + "learning_rate": 3.2712608611243435e-05, + "loss": 0.9737, + "step": 632000 + }, + { + "epoch": 1.02, + "learning_rate": 3.255770876495283e-05, + "loss": 1.0775, + "step": 633000 + }, + { + "epoch": 1.02, + "learning_rate": 3.240284451505519e-05, + "loss": 1.1036, + "step": 634000 + }, + { + "epoch": 1.02, + "learning_rate": 3.2248172704830614e-05, + "loss": 0.9381, + "step": 635000 + }, + { + "epoch": 1.03, + "learning_rate": 3.209369502574527e-05, + "loss": 0.9412, + "step": 636000 + }, + { + "epoch": 1.03, + "learning_rate": 3.1939413167142243e-05, + "loss": 0.9021, + "step": 637000 + }, + { + "epoch": 1.03, + "learning_rate": 3.178532881622327e-05, + "loss": 0.8694, + "step": 638000 + }, + { + "epoch": 1.03, + "learning_rate": 3.163159744313126e-05, + "loss": 1.0402, + "step": 639000 + }, + { + "epoch": 1.03, + "learning_rate": 3.147791295881192e-05, + "loss": 1.084, + "step": 640000 + }, + { + "epoch": 1.03, + "learning_rate": 3.1324584409263834e-05, + "loss": 1.0841, + "step": 641000 + }, + { + "epoch": 1.03, + "learning_rate": 3.117130650748039e-05, + "loss": 1.1045, + "step": 642000 + }, + { + "epoch": 1.03, + "learning_rate": 3.101838748185837e-05, + "loss": 1.1229, + "step": 643000 + }, + { + "epoch": 1.03, + "learning_rate": 3.0865522860779585e-05, + "loss": 1.0369, + "step": 644000 + }, + { + "epoch": 1.03, + "learning_rate": 3.071286749127718e-05, + "loss": 0.8539, + "step": 645000 + }, + { + "epoch": 1.04, + "learning_rate": 3.056042304276582e-05, + "loss": 0.8873, + "step": 646000 + }, + { + "epoch": 1.04, + "learning_rate": 3.0408343307471943e-05, + "loss": 0.8589, + "step": 647000 + }, + { + "epoch": 1.04, + "learning_rate": 3.025632548485837e-05, + "loss": 0.8197, + "step": 648000 + }, + { + "epoch": 1.04, + "learning_rate": 3.010467526941344e-05, + "loss": 0.9677, + "step": 649000 + }, + { + "epoch": 1.04, + "learning_rate": 2.9953090715801634e-05, + "loss": 0.9871, + "step": 650000 + }, + { + "epoch": 1.04, + "eval_accuracy": 0.831968594961707, + "eval_loss": 0.7390721440315247, + "eval_runtime": 10.7168, + "eval_samples_per_second": 466.556, + "eval_steps_per_second": 0.933, + "step": 650000 + }, + { + "epoch": 1.04, + "learning_rate": 2.980187664724534e-05, + "loss": 0.9091, + "step": 651000 + }, + { + "epoch": 1.04, + "learning_rate": 2.965073198680562e-05, + "loss": 0.9263, + "step": 652000 + }, + { + "epoch": 1.04, + "learning_rate": 2.9499960673118322e-05, + "loss": 0.9273, + "step": 653000 + }, + { + "epoch": 1.04, + "learning_rate": 2.934926251079786e-05, + "loss": 0.8529, + "step": 654000 + }, + { + "epoch": 1.04, + "learning_rate": 2.9198790181634074e-05, + "loss": 0.8239, + "step": 655000 + }, + { + "epoch": 1.05, + "learning_rate": 2.9048695461845966e-05, + "loss": 0.7665, + "step": 656000 + }, + { + "epoch": 1.05, + "learning_rate": 2.8898679503189996e-05, + "loss": 0.7491, + "step": 657000 + }, + { + "epoch": 1.05, + "learning_rate": 2.8749043974581896e-05, + "loss": 0.8524, + "step": 658000 + }, + { + "epoch": 1.05, + "learning_rate": 2.859949094205726e-05, + "loss": 0.9649, + "step": 659000 + }, + { + "epoch": 1.05, + "learning_rate": 2.845017194207104e-05, + "loss": 0.9405, + "step": 660000 + }, + { + "epoch": 1.05, + "learning_rate": 2.83012375726286e-05, + "loss": 0.9965, + "step": 661000 + }, + { + "epoch": 1.05, + "learning_rate": 2.8152391295818637e-05, + "loss": 1.0081, + "step": 662000 + }, + { + "epoch": 1.05, + "learning_rate": 2.800393242842937e-05, + "loss": 0.981, + "step": 663000 + }, + { + "epoch": 1.05, + "learning_rate": 2.7855565379304605e-05, + "loss": 0.8153, + "step": 664000 + }, + { + "epoch": 1.05, + "learning_rate": 2.7707588501547865e-05, + "loss": 0.8235, + "step": 665000 + }, + { + "epoch": 1.06, + "learning_rate": 2.7559707163675296e-05, + "loss": 0.8797, + "step": 666000 + }, + { + "epoch": 1.06, + "learning_rate": 2.7412071229249947e-05, + "loss": 0.8552, + "step": 667000 + }, + { + "epoch": 1.06, + "learning_rate": 2.7264829577788996e-05, + "loss": 0.978, + "step": 668000 + }, + { + "epoch": 1.06, + "learning_rate": 2.711768904169148e-05, + "loss": 1.0823, + "step": 669000 + }, + { + "epoch": 1.06, + "learning_rate": 2.6970945507650934e-05, + "loss": 1.0495, + "step": 670000 + }, + { + "epoch": 1.06, + "learning_rate": 2.6824306799850207e-05, + "loss": 1.0896, + "step": 671000 + }, + { + "epoch": 1.06, + "learning_rate": 2.6678067795850213e-05, + "loss": 1.1019, + "step": 672000 + }, + { + "epoch": 1.06, + "learning_rate": 2.653193732438558e-05, + "loss": 1.0991, + "step": 673000 + }, + { + "epoch": 1.06, + "learning_rate": 2.6386209240998583e-05, + "loss": 0.9356, + "step": 674000 + }, + { + "epoch": 1.06, + "learning_rate": 2.624059339169965e-05, + "loss": 0.942, + "step": 675000 + }, + { + "epoch": 1.07, + "learning_rate": 2.6095237371464796e-05, + "loss": 0.9061, + "step": 676000 + }, + { + "epoch": 1.07, + "learning_rate": 2.5950287733378777e-05, + "loss": 0.8134, + "step": 677000 + }, + { + "epoch": 1.07, + "learning_rate": 2.580545587338671e-05, + "loss": 0.9557, + "step": 678000 + }, + { + "epoch": 1.07, + "learning_rate": 2.5661033035637218e-05, + "loss": 1.0279, + "step": 679000 + }, + { + "epoch": 1.07, + "learning_rate": 2.551673166496884e-05, + "loss": 1.0323, + "step": 680000 + }, + { + "epoch": 1.07, + "learning_rate": 2.537284193876279e-05, + "loss": 1.003, + "step": 681000 + }, + { + "epoch": 1.07, + "learning_rate": 2.5229077363316923e-05, + "loss": 1.0201, + "step": 682000 + }, + { + "epoch": 1.07, + "learning_rate": 2.5085727036564444e-05, + "loss": 0.9572, + "step": 683000 + }, + { + "epoch": 1.07, + "learning_rate": 2.494250553878222e-05, + "loss": 0.7758, + "step": 684000 + }, + { + "epoch": 1.07, + "learning_rate": 2.4799558065744872e-05, + "loss": 0.7355, + "step": 685000 + }, + { + "epoch": 1.08, + "learning_rate": 2.465688618070303e-05, + "loss": 0.6992, + "step": 686000 + }, + { + "epoch": 1.08, + "learning_rate": 2.4514633699676327e-05, + "loss": 0.7698, + "step": 687000 + }, + { + "epoch": 1.08, + "learning_rate": 2.437251738882307e-05, + "loss": 0.9104, + "step": 688000 + }, + { + "epoch": 1.08, + "learning_rate": 2.4230823031561167e-05, + "loss": 0.8912, + "step": 689000 + }, + { + "epoch": 1.08, + "learning_rate": 2.408926850530927e-05, + "loss": 0.9452, + "step": 690000 + }, + { + "epoch": 1.08, + "learning_rate": 2.394813846378582e-05, + "loss": 1.0174, + "step": 691000 + }, + { + "epoch": 1.08, + "learning_rate": 2.380715190799833e-05, + "loss": 0.9929, + "step": 692000 + }, + { + "epoch": 1.08, + "learning_rate": 2.3666592349526597e-05, + "loss": 0.8779, + "step": 693000 + }, + { + "epoch": 1.08, + "learning_rate": 2.3526179925246578e-05, + "loss": 0.8655, + "step": 694000 + }, + { + "epoch": 1.08, + "learning_rate": 2.3386057014421453e-05, + "loss": 0.8112, + "step": 695000 + }, + { + "epoch": 1.09, + "learning_rate": 2.3246364835391272e-05, + "loss": 0.7425, + "step": 696000 + }, + { + "epoch": 1.09, + "learning_rate": 2.3106825252040365e-05, + "loss": 0.7938, + "step": 697000 + }, + { + "epoch": 1.09, + "learning_rate": 2.296771886621397e-05, + "loss": 0.9602, + "step": 698000 + }, + { + "epoch": 1.09, + "learning_rate": 2.2828768708148356e-05, + "loss": 1.0117, + "step": 699000 + }, + { + "epoch": 1.09, + "learning_rate": 2.269011569031495e-05, + "loss": 1.0679, + "step": 700000 + }, + { + "epoch": 1.09, + "eval_accuracy": 0.8310942232767082, + "eval_loss": 0.7435500025749207, + "eval_runtime": 12.3346, + "eval_samples_per_second": 405.365, + "eval_steps_per_second": 0.811, + "step": 700000 + }, + { + "epoch": 1.09, + "learning_rate": 2.2551899533679438e-05, + "loss": 1.0913, + "step": 701000 + }, + { + "epoch": 1.09, + "learning_rate": 2.2413845040981702e-05, + "loss": 1.0949, + "step": 702000 + }, + { + "epoch": 1.09, + "learning_rate": 2.2276229827680116e-05, + "loss": 0.9394, + "step": 703000 + }, + { + "epoch": 1.09, + "learning_rate": 2.213891719102571e-05, + "loss": 0.9283, + "step": 704000 + }, + { + "epoch": 1.09, + "learning_rate": 2.2001771636780773e-05, + "loss": 0.8962, + "step": 705000 + }, + { + "epoch": 1.1, + "learning_rate": 2.1864932266675498e-05, + "loss": 0.8581, + "step": 706000 + }, + { + "epoch": 1.1, + "learning_rate": 2.1728400577163317e-05, + "loss": 0.9983, + "step": 707000 + }, + { + "epoch": 1.1, + "learning_rate": 2.1592314128920388e-05, + "loss": 1.0883, + "step": 708000 + }, + { + "epoch": 1.1, + "learning_rate": 2.1456401965073002e-05, + "loss": 1.0622, + "step": 709000 + }, + { + "epoch": 1.1, + "learning_rate": 2.1320937393044398e-05, + "loss": 1.0325, + "step": 710000 + }, + { + "epoch": 1.1, + "learning_rate": 2.118565069415642e-05, + "loss": 1.0107, + "step": 711000 + }, + { + "epoch": 1.1, + "learning_rate": 2.1050813918077206e-05, + "loss": 1.0555, + "step": 712000 + }, + { + "epoch": 1.1, + "learning_rate": 2.0916158596110753e-05, + "loss": 0.8862, + "step": 713000 + }, + { + "epoch": 1.1, + "learning_rate": 2.0781821330374222e-05, + "loss": 0.8341, + "step": 714000 + }, + { + "epoch": 1.1, + "learning_rate": 2.0647937447607897e-05, + "loss": 0.8047, + "step": 715000 + }, + { + "epoch": 1.11, + "learning_rate": 2.0514240376387622e-05, + "loss": 0.7639, + "step": 716000 + }, + { + "epoch": 1.11, + "learning_rate": 2.0380998969779935e-05, + "loss": 0.8598, + "step": 717000 + }, + { + "epoch": 1.11, + "learning_rate": 2.0247947935572087e-05, + "loss": 0.8621, + "step": 718000 + }, + { + "epoch": 1.11, + "learning_rate": 2.0115354827706882e-05, + "loss": 0.8859, + "step": 719000 + }, + { + "epoch": 1.11, + "learning_rate": 1.9982955644776074e-05, + "loss": 0.9538, + "step": 720000 + }, + { + "epoch": 1.11, + "learning_rate": 1.9851016629906853e-05, + "loss": 0.9508, + "step": 721000 + }, + { + "epoch": 1.11, + "learning_rate": 1.9719275084032178e-05, + "loss": 0.8639, + "step": 722000 + }, + { + "epoch": 1.11, + "learning_rate": 1.958786468331379e-05, + "loss": 0.8306, + "step": 723000 + }, + { + "epoch": 1.11, + "learning_rate": 1.9456917776051037e-05, + "loss": 0.8304, + "step": 724000 + }, + { + "epoch": 1.11, + "learning_rate": 1.932617363852691e-05, + "loss": 0.7599, + "step": 725000 + }, + { + "epoch": 1.12, + "learning_rate": 1.9195895185717727e-05, + "loss": 0.8071, + "step": 726000 + }, + { + "epoch": 1.12, + "learning_rate": 1.906582302484229e-05, + "loss": 0.9583, + "step": 727000 + }, + { + "epoch": 1.12, + "learning_rate": 1.8936089155170195e-05, + "loss": 0.9304, + "step": 728000 + }, + { + "epoch": 1.12, + "learning_rate": 1.8806824219453507e-05, + "loss": 0.9371, + "step": 729000 + }, + { + "epoch": 1.12, + "learning_rate": 1.8677770842888893e-05, + "loss": 0.9491, + "step": 730000 + }, + { + "epoch": 1.12, + "learning_rate": 1.8549188540480782e-05, + "loss": 1.0003, + "step": 731000 + }, + { + "epoch": 1.12, + "learning_rate": 1.8420821296592628e-05, + "loss": 0.9365, + "step": 732000 + }, + { + "epoch": 1.12, + "learning_rate": 1.8292799397543353e-05, + "loss": 0.9281, + "step": 733000 + }, + { + "epoch": 1.12, + "learning_rate": 1.8165251744849992e-05, + "loss": 0.8999, + "step": 734000 + }, + { + "epoch": 1.12, + "learning_rate": 1.8037924382930186e-05, + "loss": 0.8513, + "step": 735000 + }, + { + "epoch": 1.13, + "learning_rate": 1.7911073355924523e-05, + "loss": 0.9405, + "step": 736000 + }, + { + "epoch": 1.13, + "learning_rate": 1.7784446095277167e-05, + "loss": 1.0675, + "step": 737000 + }, + { + "epoch": 1.13, + "learning_rate": 1.7658297237286585e-05, + "loss": 1.0324, + "step": 738000 + }, + { + "epoch": 1.13, + "learning_rate": 1.7532375611460606e-05, + "loss": 1.0777, + "step": 739000 + }, + { + "epoch": 1.13, + "learning_rate": 1.740693443512806e-05, + "loss": 1.1039, + "step": 740000 + }, + { + "epoch": 1.13, + "learning_rate": 1.7281723946836492e-05, + "loss": 1.1023, + "step": 741000 + }, + { + "epoch": 1.13, + "learning_rate": 1.715699593387959e-05, + "loss": 0.9438, + "step": 742000 + }, + { + "epoch": 1.13, + "learning_rate": 1.7032502054759063e-05, + "loss": 0.8537, + "step": 743000 + }, + { + "epoch": 1.13, + "learning_rate": 1.6908492655730533e-05, + "loss": 0.8055, + "step": 744000 + }, + { + "epoch": 1.13, + "learning_rate": 1.6784720826102262e-05, + "loss": 0.8227, + "step": 745000 + }, + { + "epoch": 1.14, + "learning_rate": 1.666143546015176e-05, + "loss": 0.9655, + "step": 746000 + }, + { + "epoch": 1.14, + "learning_rate": 1.6538391088783723e-05, + "loss": 0.9601, + "step": 747000 + }, + { + "epoch": 1.14, + "learning_rate": 1.6415835143421003e-05, + "loss": 0.9641, + "step": 748000 + }, + { + "epoch": 1.14, + "learning_rate": 1.629352360729162e-05, + "loss": 0.9631, + "step": 749000 + }, + { + "epoch": 1.14, + "learning_rate": 1.6171580679787825e-05, + "loss": 0.9203, + "step": 750000 + }, + { + "epoch": 1.14, + "eval_accuracy": 0.8355373765019023, + "eval_loss": 0.7186969518661499, + "eval_runtime": 10.9844, + "eval_samples_per_second": 455.192, + "eval_steps_per_second": 0.91, + "step": 750000 + }, + { + "epoch": 1.14, + "learning_rate": 1.6050007694457925e-05, + "loss": 0.8906, + "step": 751000 + }, + { + "epoch": 1.14, + "learning_rate": 1.592892699662655e-05, + "loss": 0.7487, + "step": 752000 + }, + { + "epoch": 1.14, + "learning_rate": 1.5808097506834524e-05, + "loss": 0.7857, + "step": 753000 + }, + { + "epoch": 1.14, + "learning_rate": 1.5687641934210118e-05, + "loss": 0.7229, + "step": 754000 + }, + { + "epoch": 1.14, + "learning_rate": 1.556768148850735e-05, + "loss": 0.7239, + "step": 755000 + }, + { + "epoch": 1.15, + "learning_rate": 1.544797732076107e-05, + "loss": 0.923, + "step": 756000 + }, + { + "epoch": 1.15, + "learning_rate": 1.532877014553799e-05, + "loss": 0.9589, + "step": 757000 + }, + { + "epoch": 1.15, + "learning_rate": 1.5209822613682983e-05, + "loss": 0.9368, + "step": 758000 + }, + { + "epoch": 1.15, + "learning_rate": 1.5091255541633964e-05, + "loss": 0.9942, + "step": 759000 + }, + { + "epoch": 1.15, + "learning_rate": 1.4973188220219254e-05, + "loss": 0.996, + "step": 760000 + }, + { + "epoch": 1.15, + "learning_rate": 1.4855385569805891e-05, + "loss": 0.898, + "step": 761000 + }, + { + "epoch": 1.15, + "learning_rate": 1.4738084481176312e-05, + "loss": 0.8023, + "step": 762000 + }, + { + "epoch": 1.15, + "learning_rate": 1.4621051400316382e-05, + "loss": 0.7635, + "step": 763000 + }, + { + "epoch": 1.15, + "learning_rate": 1.4504405217970129e-05, + "loss": 0.7792, + "step": 764000 + }, + { + "epoch": 1.15, + "learning_rate": 1.4388263273453235e-05, + "loss": 0.8774, + "step": 765000 + }, + { + "epoch": 1.16, + "learning_rate": 1.4272509994685329e-05, + "loss": 1.0395, + "step": 766000 + }, + { + "epoch": 1.16, + "learning_rate": 1.4157031361942913e-05, + "loss": 1.0266, + "step": 767000 + }, + { + "epoch": 1.16, + "learning_rate": 1.4041944702162985e-05, + "loss": 1.0627, + "step": 768000 + }, + { + "epoch": 1.16, + "learning_rate": 1.3927251273914792e-05, + "loss": 1.083, + "step": 769000 + }, + { + "epoch": 1.16, + "learning_rate": 1.3813066432947708e-05, + "loss": 1.0811, + "step": 770000 + }, + { + "epoch": 1.16, + "learning_rate": 1.3699162829897188e-05, + "loss": 0.9505, + "step": 771000 + }, + { + "epoch": 1.16, + "learning_rate": 1.358576951490385e-05, + "loss": 0.9315, + "step": 772000 + }, + { + "epoch": 1.16, + "learning_rate": 1.3472660714582335e-05, + "loss": 0.9083, + "step": 773000 + }, + { + "epoch": 1.16, + "learning_rate": 1.3360063880794788e-05, + "loss": 0.8656, + "step": 774000 + }, + { + "epoch": 1.16, + "learning_rate": 1.3247754826001119e-05, + "loss": 0.9627, + "step": 775000 + }, + { + "epoch": 1.17, + "learning_rate": 1.3135847687872443e-05, + "loss": 0.9883, + "step": 776000 + }, + { + "epoch": 1.17, + "learning_rate": 1.3024454992430079e-05, + "loss": 0.9516, + "step": 777000 + }, + { + "epoch": 1.17, + "learning_rate": 1.2913354949650841e-05, + "loss": 1.044, + "step": 778000 + }, + { + "epoch": 1.17, + "learning_rate": 1.2802881463850613e-05, + "loss": 1.0429, + "step": 779000 + }, + { + "epoch": 1.17, + "learning_rate": 1.2692592964051836e-05, + "loss": 0.9851, + "step": 780000 + }, + { + "epoch": 1.17, + "learning_rate": 1.2582712452079226e-05, + "loss": 0.8493, + "step": 781000 + }, + { + "epoch": 1.17, + "learning_rate": 1.2473241129568458e-05, + "loss": 0.7932, + "step": 782000 + }, + { + "epoch": 1.17, + "learning_rate": 1.236428904923082e-05, + "loss": 0.702, + "step": 783000 + }, + { + "epoch": 1.17, + "learning_rate": 1.2255639280464832e-05, + "loss": 0.6618, + "step": 784000 + }, + { + "epoch": 1.17, + "learning_rate": 1.2147402277980474e-05, + "loss": 0.8469, + "step": 785000 + }, + { + "epoch": 1.18, + "learning_rate": 1.2039686841331998e-05, + "loss": 0.9126, + "step": 786000 + }, + { + "epoch": 1.18, + "learning_rate": 1.1932278502155054e-05, + "loss": 0.8954, + "step": 787000 + }, + { + "epoch": 1.18, + "learning_rate": 1.1825286465481434e-05, + "loss": 0.9188, + "step": 788000 + }, + { + "epoch": 1.18, + "learning_rate": 1.1718818267007175e-05, + "loss": 0.9873, + "step": 789000 + }, + { + "epoch": 1.18, + "learning_rate": 1.1612661921699398e-05, + "loss": 0.9544, + "step": 790000 + }, + { + "epoch": 1.18, + "learning_rate": 1.150703090064395e-05, + "loss": 0.801, + "step": 791000 + }, + { + "epoch": 1.18, + "learning_rate": 1.1401714885682025e-05, + "loss": 0.8218, + "step": 792000 + }, + { + "epoch": 1.18, + "learning_rate": 1.1296820975382121e-05, + "loss": 0.7743, + "step": 793000 + }, + { + "epoch": 1.18, + "learning_rate": 1.1192454575710875e-05, + "loss": 0.7675, + "step": 794000 + }, + { + "epoch": 1.18, + "learning_rate": 1.1088407886452029e-05, + "loss": 0.9008, + "step": 795000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0984890136358416e-05, + "loss": 0.8879, + "step": 796000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0881695214929688e-05, + "loss": 0.9673, + "step": 797000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0778928085014794e-05, + "loss": 1.0564, + "step": 798000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0676691994057019e-05, + "loss": 1.0726, + "step": 799000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0574783383421865e-05, + "loss": 0.9626, + "step": 800000 + }, + { + "epoch": 1.19, + "eval_accuracy": 0.8352768932764173, + "eval_loss": 0.7242327928543091, + "eval_runtime": 12.0402, + "eval_samples_per_second": 415.274, + "eval_steps_per_second": 0.831, + "step": 800000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0473407182373813e-05, + "loss": 0.9342, + "step": 801000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0372361544374464e-05, + "loss": 0.8919, + "step": 802000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0271849663326171e-05, + "loss": 0.8567, + "step": 803000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0171671413607247e-05, + "loss": 0.9096, + "step": 804000 + }, + { + "epoch": 1.19, + "learning_rate": 1.0072028244878407e-05, + "loss": 1.0644, + "step": 805000 + }, + { + "epoch": 1.2, + "learning_rate": 9.97272176118008e-06, + "loss": 1.0327, + "step": 806000 + }, + { + "epoch": 1.2, + "learning_rate": 9.873853009383633e-06, + "loss": 1.076, + "step": 807000 + }, + { + "epoch": 1.2, + "learning_rate": 9.775423070701888e-06, + "loss": 1.0958, + "step": 808000 + }, + { + "epoch": 1.2, + "learning_rate": 9.67753079151617e-06, + "loss": 1.0053, + "step": 809000 + }, + { + "epoch": 1.2, + "learning_rate": 9.57998126200062e-06, + "loss": 0.8648, + "step": 810000 + }, + { + "epoch": 1.2, + "learning_rate": 9.482970645690526e-06, + "loss": 0.8919, + "step": 811000 + }, + { + "epoch": 1.2, + "learning_rate": 9.38630578820755e-06, + "loss": 0.8439, + "step": 812000 + }, + { + "epoch": 1.2, + "learning_rate": 9.290181074089233e-06, + "loss": 0.7462, + "step": 813000 + }, + { + "epoch": 1.2, + "learning_rate": 9.194405112845483e-06, + "loss": 0.8975, + "step": 814000 + }, + { + "epoch": 1.2, + "learning_rate": 9.099170501521598e-06, + "loss": 0.9167, + "step": 815000 + }, + { + "epoch": 1.21, + "learning_rate": 9.00428762187942e-06, + "loss": 0.8518, + "step": 816000 + }, + { + "epoch": 1.21, + "learning_rate": 8.909947275055568e-06, + "loss": 0.8843, + "step": 817000 + }, + { + "epoch": 1.21, + "learning_rate": 8.815961623350038e-06, + "loss": 0.926, + "step": 818000 + }, + { + "epoch": 1.21, + "learning_rate": 8.722519663652901e-06, + "loss": 0.9408, + "step": 819000 + }, + { + "epoch": 1.21, + "learning_rate": 8.629435347010716e-06, + "loss": 0.7628, + "step": 820000 + }, + { + "epoch": 1.21, + "learning_rate": 8.536803452235437e-06, + "loss": 0.7669, + "step": 821000 + }, + { + "epoch": 1.21, + "learning_rate": 8.444624992334588e-06, + "loss": 0.7749, + "step": 822000 + }, + { + "epoch": 1.21, + "learning_rate": 8.352992472045557e-06, + "loss": 0.7575, + "step": 823000 + }, + { + "epoch": 1.21, + "learning_rate": 8.26172344512513e-06, + "loss": 0.899, + "step": 824000 + }, + { + "epoch": 1.21, + "learning_rate": 8.171001445569593e-06, + "loss": 0.9519, + "step": 825000 + }, + { + "epoch": 1.22, + "learning_rate": 8.080645840041112e-06, + "loss": 0.9472, + "step": 826000 + }, + { + "epoch": 1.22, + "learning_rate": 7.990838325725758e-06, + "loss": 0.9576, + "step": 827000 + }, + { + "epoch": 1.22, + "learning_rate": 7.901400090084665e-06, + "loss": 0.9493, + "step": 828000 + }, + { + "epoch": 1.22, + "learning_rate": 7.81251098555364e-06, + "loss": 0.8743, + "step": 829000 + }, + { + "epoch": 1.22, + "learning_rate": 7.723994028206778e-06, + "loss": 0.8593, + "step": 830000 + }, + { + "epoch": 1.22, + "learning_rate": 7.636027217870157e-06, + "loss": 0.8767, + "step": 831000 + }, + { + "epoch": 1.22, + "learning_rate": 7.54843540696496e-06, + "loss": 0.853, + "step": 832000 + }, + { + "epoch": 1.22, + "learning_rate": 7.461394734929022e-06, + "loss": 0.8742, + "step": 833000 + }, + { + "epoch": 1.22, + "learning_rate": 7.374731898184495e-06, + "loss": 1.0273, + "step": 834000 + }, + { + "epoch": 1.22, + "learning_rate": 7.2886211680837424e-06, + "loss": 1.0271, + "step": 835000 + }, + { + "epoch": 1.23, + "learning_rate": 7.202891092623126e-06, + "loss": 1.049, + "step": 836000 + }, + { + "epoch": 1.23, + "learning_rate": 7.11771406745404e-06, + "loss": 1.0773, + "step": 837000 + }, + { + "epoch": 1.23, + "learning_rate": 7.032920499639423e-06, + "loss": 1.0882, + "step": 838000 + }, + { + "epoch": 1.23, + "learning_rate": 6.94868090159605e-06, + "loss": 0.9609, + "step": 839000 + }, + { + "epoch": 1.23, + "learning_rate": 6.864827546864583e-06, + "loss": 0.9468, + "step": 840000 + }, + { + "epoch": 1.23, + "learning_rate": 6.781529057175845e-06, + "loss": 0.8922, + "step": 841000 + }, + { + "epoch": 1.23, + "learning_rate": 6.698619579877818e-06, + "loss": 0.7689, + "step": 842000 + }, + { + "epoch": 1.23, + "learning_rate": 6.616183639538559e-06, + "loss": 0.852, + "step": 843000 + }, + { + "epoch": 1.23, + "learning_rate": 6.534385586581854e-06, + "loss": 1.0235, + "step": 844000 + }, + { + "epoch": 1.23, + "learning_rate": 6.452898467929852e-06, + "loss": 0.9675, + "step": 845000 + }, + { + "epoch": 1.24, + "learning_rate": 6.371887573403335e-06, + "loss": 0.9587, + "step": 846000 + }, + { + "epoch": 1.24, + "learning_rate": 6.29135378892447e-06, + "loss": 1.0189, + "step": 847000 + }, + { + "epoch": 1.24, + "learning_rate": 6.211377811943364e-06, + "loss": 0.9343, + "step": 848000 + }, + { + "epoch": 1.24, + "learning_rate": 6.131879743067948e-06, + "loss": 0.7473, + "step": 849000 + }, + { + "epoch": 1.24, + "learning_rate": 6.0528604499385185e-06, + "loss": 0.7263, + "step": 850000 + }, + { + "epoch": 1.24, + "eval_accuracy": 0.8377542903972233, + "eval_loss": 0.7093546986579895, + "eval_runtime": 11.2849, + "eval_samples_per_second": 443.071, + "eval_steps_per_second": 0.886, + "step": 850000 + }, + { + "epoch": 1.24, + "learning_rate": 5.974242417283055e-06, + "loss": 0.7407, + "step": 851000 + }, + { + "epoch": 1.24, + "learning_rate": 5.896105843258959e-06, + "loss": 0.7019, + "step": 852000 + }, + { + "epoch": 1.24, + "learning_rate": 5.818451582355289e-06, + "loss": 0.8287, + "step": 853000 + }, + { + "epoch": 1.24, + "learning_rate": 5.741280483786604e-06, + "loss": 0.9027, + "step": 854000 + }, + { + "epoch": 1.24, + "learning_rate": 5.664593391483675e-06, + "loss": 0.9336, + "step": 855000 + }, + { + "epoch": 1.25, + "learning_rate": 5.588467103873829e-06, + "loss": 0.9669, + "step": 856000 + }, + { + "epoch": 1.25, + "learning_rate": 5.5127500486207735e-06, + "loss": 0.9644, + "step": 857000 + }, + { + "epoch": 1.25, + "learning_rate": 5.437594486071778e-06, + "loss": 0.9871, + "step": 858000 + }, + { + "epoch": 1.25, + "learning_rate": 5.362850776666601e-06, + "loss": 0.7998, + "step": 859000 + }, + { + "epoch": 1.25, + "learning_rate": 5.288595211973824e-06, + "loss": 0.799, + "step": 860000 + }, + { + "epoch": 1.25, + "learning_rate": 5.214902126145504e-06, + "loss": 0.7326, + "step": 861000 + }, + { + "epoch": 1.25, + "learning_rate": 5.141624791506633e-06, + "loss": 0.7172, + "step": 862000 + }, + { + "epoch": 1.25, + "learning_rate": 5.068910562340645e-06, + "loss": 0.956, + "step": 863000 + }, + { + "epoch": 1.25, + "learning_rate": 4.9966146599391025e-06, + "loss": 1.0129, + "step": 864000 + }, + { + "epoch": 1.25, + "learning_rate": 4.924882465011432e-06, + "loss": 1.0269, + "step": 865000 + }, + { + "epoch": 1.26, + "learning_rate": 4.853571154142544e-06, + "loss": 1.0608, + "step": 866000 + }, + { + "epoch": 1.26, + "learning_rate": 4.782824128114777e-06, + "loss": 1.0748, + "step": 867000 + }, + { + "epoch": 1.26, + "learning_rate": 4.7125005250476976e-06, + "loss": 0.9914, + "step": 868000 + }, + { + "epoch": 1.26, + "learning_rate": 4.642741759530028e-06, + "loss": 0.9261, + "step": 869000 + }, + { + "epoch": 1.26, + "learning_rate": 4.573408937371581e-06, + "loss": 0.8934, + "step": 870000 + }, + { + "epoch": 1.26, + "learning_rate": 4.504572893135017e-06, + "loss": 0.868, + "step": 871000 + }, + { + "epoch": 1.26, + "learning_rate": 4.436302469348041e-06, + "loss": 0.8991, + "step": 872000 + }, + { + "epoch": 1.26, + "learning_rate": 4.368461735204643e-06, + "loss": 1.0529, + "step": 873000 + }, + { + "epoch": 1.26, + "learning_rate": 4.3011871124621965e-06, + "loss": 1.0182, + "step": 874000 + }, + { + "epoch": 1.26, + "learning_rate": 4.23434465301864e-06, + "loss": 0.9739, + "step": 875000 + }, + { + "epoch": 1.27, + "learning_rate": 4.168068771188594e-06, + "loss": 1.0152, + "step": 876000 + }, + { + "epoch": 1.27, + "learning_rate": 4.10222750742742e-06, + "loss": 1.0456, + "step": 877000 + }, + { + "epoch": 1.27, + "learning_rate": 4.036888174352743e-06, + "loss": 0.8894, + "step": 878000 + }, + { + "epoch": 1.27, + "learning_rate": 3.972116071885162e-06, + "loss": 0.8262, + "step": 879000 + }, + { + "epoch": 1.27, + "learning_rate": 3.907782234601492e-06, + "loss": 0.8263, + "step": 880000 + }, + { + "epoch": 1.27, + "learning_rate": 3.844016032197534e-06, + "loss": 0.6921, + "step": 881000 + }, + { + "epoch": 1.27, + "learning_rate": 3.78069050206184e-06, + "loss": 0.75, + "step": 882000 + }, + { + "epoch": 1.27, + "learning_rate": 3.7179329862731317e-06, + "loss": 0.8619, + "step": 883000 + }, + { + "epoch": 1.27, + "learning_rate": 3.6556185305799074e-06, + "loss": 0.8767, + "step": 884000 + }, + { + "epoch": 1.27, + "learning_rate": 3.593872443878982e-06, + "loss": 0.92, + "step": 885000 + }, + { + "epoch": 1.28, + "learning_rate": 3.532571785739236e-06, + "loss": 0.9303, + "step": 886000 + }, + { + "epoch": 1.28, + "learning_rate": 3.4718398263996642e-06, + "loss": 0.9353, + "step": 887000 + }, + { + "epoch": 1.28, + "learning_rate": 3.411555644621961e-06, + "loss": 0.834, + "step": 888000 + }, + { + "epoch": 1.28, + "learning_rate": 3.3517809466076076e-06, + "loss": 0.8077, + "step": 889000 + }, + { + "epoch": 1.28, + "learning_rate": 3.2925753955738435e-06, + "loss": 0.7661, + "step": 890000 + }, + { + "epoch": 1.28, + "learning_rate": 3.2338211094595227e-06, + "loss": 0.7519, + "step": 891000 + }, + { + "epoch": 1.28, + "learning_rate": 3.1755782507849554e-06, + "loss": 0.8897, + "step": 892000 + }, + { + "epoch": 1.28, + "learning_rate": 3.117904931292237e-06, + "loss": 0.9311, + "step": 893000 + }, + { + "epoch": 1.28, + "learning_rate": 3.060686319691808e-06, + "loss": 0.8915, + "step": 894000 + }, + { + "epoch": 1.28, + "learning_rate": 3.004037477585625e-06, + "loss": 0.9306, + "step": 895000 + }, + { + "epoch": 1.29, + "learning_rate": 2.9478456134844844e-06, + "loss": 1.0244, + "step": 896000 + }, + { + "epoch": 1.29, + "learning_rate": 2.892223724291321e-06, + "loss": 1.029, + "step": 897000 + }, + { + "epoch": 1.29, + "learning_rate": 2.8370610632462602e-06, + "loss": 0.8896, + "step": 898000 + }, + { + "epoch": 1.29, + "learning_rate": 2.7824685576152752e-06, + "loss": 0.8942, + "step": 899000 + }, + { + "epoch": 1.29, + "learning_rate": 2.728337510207396e-06, + "loss": 0.8578, + "step": 900000 + }, + { + "epoch": 1.29, + "eval_accuracy": 0.8368069978989446, + "eval_loss": 0.7140042185783386, + "eval_runtime": 10.5028, + "eval_samples_per_second": 476.062, + "eval_steps_per_second": 0.952, + "step": 900000 + }, + { + "epoch": 1.29, + "learning_rate": 2.6747767738041995e-06, + "loss": 0.861, + "step": 901000 + }, + { + "epoch": 1.29, + "learning_rate": 2.621679705533614e-06, + "loss": 1.0138, + "step": 902000 + }, + { + "epoch": 1.29, + "learning_rate": 2.569100758983095e-06, + "loss": 1.0312, + "step": 903000 + }, + { + "epoch": 1.29, + "learning_rate": 2.517092310118435e-06, + "loss": 1.0481, + "step": 904000 + }, + { + "epoch": 1.29, + "learning_rate": 2.4655508067719378e-06, + "loss": 1.0777, + "step": 905000 + }, + { + "epoch": 1.3, + "learning_rate": 2.4145798943795206e-06, + "loss": 1.0943, + "step": 906000 + }, + { + "epoch": 1.3, + "learning_rate": 2.3640780865753555e-06, + "loss": 0.9609, + "step": 907000 + }, + { + "epoch": 1.3, + "learning_rate": 2.314146938058942e-06, + "loss": 0.8363, + "step": 908000 + }, + { + "epoch": 1.3, + "learning_rate": 2.2646870327012294e-06, + "loss": 0.842, + "step": 909000 + }, + { + "epoch": 1.3, + "learning_rate": 2.215797830027422e-06, + "loss": 0.8333, + "step": 910000 + }, + { + "epoch": 1.3, + "learning_rate": 2.1673819884896694e-06, + "loss": 0.8255, + "step": 911000 + }, + { + "epoch": 1.3, + "learning_rate": 2.119489236799016e-06, + "loss": 0.9393, + "step": 912000 + }, + { + "epoch": 1.3, + "learning_rate": 2.0721200987029453e-06, + "loss": 0.9491, + "step": 913000 + }, + { + "epoch": 1.3, + "learning_rate": 2.025321675254488e-06, + "loss": 0.8986, + "step": 914000 + }, + { + "epoch": 1.3, + "learning_rate": 1.9790007877817653e-06, + "loss": 0.8977, + "step": 915000 + }, + { + "epoch": 1.31, + "learning_rate": 1.933250583520735e-06, + "loss": 0.8944, + "step": 916000 + }, + { + "epoch": 1.31, + "learning_rate": 1.887979970871323e-06, + "loss": 0.8071, + "step": 917000 + }, + { + "epoch": 1.31, + "learning_rate": 1.8432355035647219e-06, + "loss": 0.7774, + "step": 918000 + }, + { + "epoch": 1.31, + "learning_rate": 1.799061625536469e-06, + "loss": 0.7421, + "step": 919000 + }, + { + "epoch": 1.31, + "learning_rate": 1.7553703837534352e-06, + "loss": 0.7138, + "step": 920000 + }, + { + "epoch": 1.31, + "learning_rate": 1.712249637465102e-06, + "loss": 0.86, + "step": 921000 + }, + { + "epoch": 1.31, + "learning_rate": 1.6696135304913818e-06, + "loss": 0.9401, + "step": 922000 + }, + { + "epoch": 1.31, + "learning_rate": 1.627547800298329e-06, + "loss": 0.9108, + "step": 923000 + }, + { + "epoch": 1.31, + "learning_rate": 1.5859686913107963e-06, + "loss": 0.9746, + "step": 924000 + }, + { + "epoch": 1.31, + "learning_rate": 1.5449190304130446e-06, + "loss": 0.975, + "step": 925000 + }, + { + "epoch": 1.32, + "learning_rate": 1.5044395214499573e-06, + "loss": 0.9505, + "step": 926000 + }, + { + "epoch": 1.32, + "learning_rate": 1.4644495671149062e-06, + "loss": 0.7784, + "step": 927000 + }, + { + "epoch": 1.32, + "learning_rate": 1.4250295836934214e-06, + "loss": 0.7832, + "step": 928000 + }, + { + "epoch": 1.32, + "learning_rate": 1.3861010834623766e-06, + "loss": 0.8266, + "step": 929000 + }, + { + "epoch": 1.32, + "learning_rate": 1.3477423482163465e-06, + "loss": 0.8174, + "step": 930000 + }, + { + "epoch": 1.32, + "learning_rate": 1.3098770032456076e-06, + "loss": 0.9858, + "step": 931000 + }, + { + "epoch": 1.32, + "learning_rate": 1.272581192432859e-06, + "loss": 1.0364, + "step": 932000 + }, + { + "epoch": 1.32, + "learning_rate": 1.2357806574193386e-06, + "loss": 1.029, + "step": 933000 + }, + { + "epoch": 1.32, + "learning_rate": 1.1995494008484487e-06, + "loss": 1.0567, + "step": 934000 + }, + { + "epoch": 1.32, + "learning_rate": 1.1638152839573868e-06, + "loss": 1.0777, + "step": 935000 + }, + { + "epoch": 1.33, + "learning_rate": 1.12861523175814e-06, + "loss": 1.0355, + "step": 936000 + }, + { + "epoch": 1.33, + "learning_rate": 1.0939840277111369e-06, + "loss": 0.9177, + "step": 937000 + }, + { + "epoch": 1.33, + "learning_rate": 1.0598527188610874e-06, + "loss": 0.9093, + "step": 938000 + }, + { + "epoch": 1.33, + "learning_rate": 1.0262899402720737e-06, + "loss": 0.8764, + "step": 939000 + }, + { + "epoch": 1.33, + "learning_rate": 9.93228866292617e-07, + "loss": 0.8315, + "step": 940000 + }, + { + "epoch": 1.33, + "learning_rate": 9.607359798384785e-07, + "loss": 0.943, + "step": 941000 + }, + { + "epoch": 1.33, + "learning_rate": 9.287465854822597e-07, + "loss": 0.9774, + "step": 942000 + }, + { + "epoch": 1.33, + "learning_rate": 8.973250110861309e-07, + "loss": 1.0164, + "step": 943000 + }, + { + "epoch": 1.33, + "learning_rate": 8.664086942739291e-07, + "loss": 0.9901, + "step": 944000 + }, + { + "epoch": 1.33, + "learning_rate": 8.360296945443436e-07, + "loss": 0.9853, + "step": 945000 + }, + { + "epoch": 1.34, + "learning_rate": 8.061883441172646e-07, + "loss": 0.8922, + "step": 946000 + }, + { + "epoch": 1.34, + "learning_rate": 7.769140038817014e-07, + "loss": 0.7675, + "step": 947000 + }, + { + "epoch": 1.34, + "learning_rate": 7.48148386742864e-07, + "loss": 0.7123, + "step": 948000 + }, + { + "epoch": 1.34, + "learning_rate": 7.199213799624294e-07, + "loss": 0.6682, + "step": 949000 + }, + { + "epoch": 1.34, + "learning_rate": 6.922607110228129e-07, + "loss": 0.7693, + "step": 950000 + }, + { + "epoch": 1.34, + "eval_accuracy": 0.8377192585963231, + "eval_loss": 0.7091149091720581, + "eval_runtime": 9.9645, + "eval_samples_per_second": 501.78, + "eval_steps_per_second": 1.004, + "step": 950000 + }, + { + "epoch": 1.34, + "learning_rate": 6.651113057525916e-07, + "loss": 0.899, + "step": 951000 + }, + { + "epoch": 1.34, + "learning_rate": 6.385277592210082e-07, + "loss": 0.8642, + "step": 952000 + }, + { + "epoch": 1.34, + "learning_rate": 6.124571418766378e-07, + "loss": 0.9313, + "step": 953000 + }, + { + "epoch": 1.34, + "learning_rate": 5.869518794409723e-07, + "loss": 0.9976, + "step": 954000 + }, + { + "epoch": 1.34, + "learning_rate": 5.619611892955956e-07, + "loss": 0.9632, + "step": 955000 + }, + { + "epoch": 1.35, + "learning_rate": 5.375353255232474e-07, + "loss": 0.833, + "step": 956000 + }, + { + "epoch": 1.35, + "learning_rate": 5.136256546577067e-07, + "loss": 0.8247, + "step": 957000 + }, + { + "epoch": 1.35, + "learning_rate": 4.902571588535909e-07, + "loss": 0.789, + "step": 958000 + }, + { + "epoch": 1.35, + "learning_rate": 4.674526502020382e-07, + "loss": 0.7219, + "step": 959000 + }, + { + "epoch": 1.35, + "learning_rate": 4.451667234591728e-07, + "loss": 0.8131, + "step": 960000 + }, + { + "epoch": 1.35, + "learning_rate": 4.234441936661282e-07, + "loss": 0.9334, + "step": 961000 + }, + { + "epoch": 1.35, + "learning_rate": 4.0224180986853655e-07, + "loss": 0.9777, + "step": 962000 + }, + { + "epoch": 1.35, + "learning_rate": 3.8160220819785095e-07, + "loss": 1.0358, + "step": 963000 + }, + { + "epoch": 1.35, + "learning_rate": 3.6148429387927175e-07, + "loss": 1.0623, + "step": 964000 + }, + { + "epoch": 1.35, + "learning_rate": 3.419285222713675e-07, + "loss": 1.0595, + "step": 965000 + }, + { + "epoch": 1.36, + "learning_rate": 3.228959565747369e-07, + "loss": 0.9013, + "step": 966000 + }, + { + "epoch": 1.36, + "learning_rate": 3.044248696072116e-07, + "loss": 0.9035, + "step": 967000 + }, + { + "epoch": 1.36, + "learning_rate": 2.86478484246272e-07, + "loss": 0.865, + "step": 968000 + }, + { + "epoch": 1.36, + "learning_rate": 2.690928890965172e-07, + "loss": 0.8449, + "step": 969000 + }, + { + "epoch": 1.36, + "learning_rate": 2.5223346831947934e-07, + "loss": 1.0131, + "step": 970000 + }, + { + "epoch": 1.36, + "learning_rate": 2.359180811469297e-07, + "loss": 1.0522, + "step": 971000 + }, + { + "epoch": 1.36, + "learning_rate": 2.2016240528467956e-07, + "loss": 1.0527, + "step": 972000 + }, + { + "epoch": 1.36, + "learning_rate": 2.0493507016841605e-07, + "loss": 1.0244, + "step": 973000 + }, + { + "epoch": 1.36, + "learning_rate": 1.902666966315303e-07, + "loss": 0.9939, + "step": 974000 + }, + { + "epoch": 1.36, + "learning_rate": 1.7612807899859974e-07, + "loss": 0.975, + "step": 975000 + }, + { + "epoch": 1.37, + "learning_rate": 1.6254764878778085e-07, + "loss": 0.8924, + "step": 976000 + }, + { + "epoch": 1.37, + "learning_rate": 1.4949836648880388e-07, + "loss": 0.8095, + "step": 977000 + }, + { + "epoch": 1.37, + "learning_rate": 1.3699424128894024e-07, + "loss": 0.7741, + "step": 978000 + }, + { + "epoch": 1.37, + "learning_rate": 1.2503540993129005e-07, + "loss": 0.7888, + "step": 979000 + }, + { + "epoch": 1.37, + "learning_rate": 1.1363314412082271e-07, + "loss": 0.8592, + "step": 980000 + }, + { + "epoch": 1.37, + "learning_rate": 1.0276474121272417e-07, + "loss": 0.8308, + "step": 981000 + }, + { + "epoch": 1.37, + "learning_rate": 9.245205661059241e-08, + "loss": 0.8657, + "step": 982000 + }, + { + "epoch": 1.37, + "learning_rate": 8.2674557095902e-08, + "loss": 0.9413, + "step": 983000 + }, + { + "epoch": 1.37, + "learning_rate": 7.345190436134352e-08, + "loss": 0.9376, + "step": 984000 + }, + { + "epoch": 1.37, + "learning_rate": 6.476573551197352e-08, + "loss": 0.8103, + "step": 985000 + }, + { + "epoch": 1.38, + "learning_rate": 5.6633517670373616e-08, + "loss": 0.8122, + "step": 986000 + }, + { + "epoch": 1.38, + "learning_rate": 4.903905906762374e-08, + "loss": 0.809, + "step": 987000 + }, + { + "epoch": 1.38, + "learning_rate": 4.1990856170864845e-08, + "loss": 0.7404, + "step": 988000 + }, + { + "epoch": 1.38, + "learning_rate": 3.549521501085562e-08, + "loss": 0.8188, + "step": 989000 + }, + { + "epoch": 1.38, + "learning_rate": 2.9539202348127794e-08, + "loss": 0.9535, + "step": 990000 + }, + { + "epoch": 1.38, + "learning_rate": 2.412965863871075e-08, + "loss": 0.9178, + "step": 991000 + }, + { + "epoch": 1.38, + "learning_rate": 1.9271233047113424e-08, + "loss": 0.9305, + "step": 992000 + }, + { + "epoch": 1.38, + "learning_rate": 1.4954252135407352e-08, + "loss": 0.938, + "step": 993000 + }, + { + "epoch": 1.38, + "learning_rate": 1.1187396973016962e-08, + "loss": 0.969, + "step": 994000 + }, + { + "epoch": 1.38, + "learning_rate": 7.963167508967528e-09, + "loss": 0.8864, + "step": 995000 + }, + { + "epoch": 1.39, + "learning_rate": 5.288047385498818e-09, + "loss": 0.8899, + "step": 996000 + }, + { + "epoch": 1.39, + "learning_rate": 3.156710266344343e-09, + "loss": 0.8701, + "step": 997000 + }, + { + "epoch": 1.39, + "learning_rate": 1.5721306152016724e-09, + "loss": 0.8305, + "step": 998000 + }, + { + "epoch": 1.39, + "learning_rate": 5.350904473455653e-10, + "loss": 0.961, + "step": 999000 + }, + { + "epoch": 1.39, + "learning_rate": 4.3524949094875254e-11, + "loss": 1.0488, + "step": 1000000 + }, + { + "epoch": 1.39, + "eval_accuracy": 0.8387476162115661, + "eval_loss": 0.7079769372940063, + "eval_runtime": 10.2424, + "eval_samples_per_second": 488.169, + "eval_steps_per_second": 0.976, + "step": 1000000 + }, + { + "epoch": 1.39, + "step": 1000000, + "total_flos": 6.738157336654774e+19, + "train_loss": 0.18125205249023438, + "train_runtime": 269324.8897, + "train_samples_per_second": 3802.099, + "train_steps_per_second": 3.713 + } + ], + "max_steps": 1000000, + "num_train_epochs": 9223372036854775807, + "total_flos": 6.738157336654774e+19, + "trial_name": null, + "trial_params": null +}